author    Johann <johannkoenig@google.com>  2012-10-10 11:46:26 -0700
committer Johann <johannkoenig@google.com>  2012-12-11 11:30:01 -0800
commit    1b362b15af34006e6a11974088a46d42b903418e
tree      5c05c1e135557201ef2797e62a4e1480e712a7a2 /libvpx/vp8
parent    b9c4e676b7be3cf3a9e45e06d71bf0ca92344a5b
Update libvpx and incorporate new build system
Move libvpx down a directory. Separate libwebm and rename it. It would
be more accurate to rename external/libvpx to external/webm.

Use file lists directly from upstream libvpx. This allows adding new
targets and new features (such as the encoder) easily.

MIPS and x86 options are in progress.

See new file "UPDATING"

The new libvpx checkout is not from a release branch. The decoder is
stable but it should be checked and potentially updated if the encoder
is enabled.

Requires I42b51e2845a696a6e211dde00951afc8f571336f which updates
libstagefright to account for new paths and library names.

Change-Id: I739f99d48b8d7e6354c416ef2ca79c954826307f
Diffstat (limited to 'libvpx/vp8')
-rw-r--r--  libvpx/vp8/common/alloccommon.c | 190
-rw-r--r--  libvpx/vp8/common/alloccommon.h | 23
-rw-r--r--  libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm | 237
-rw-r--r--  libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm | 186
-rw-r--r--  libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm | 128
-rw-r--r--  libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm | 128
-rw-r--r--  libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm | 70
-rw-r--r--  libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm | 190
-rw-r--r--  libvpx/vp8/common/arm/armv6/dequantize_v6.asm | 69
-rw-r--r--  libvpx/vp8/common/arm/armv6/filter_v6.asm | 624
-rw-r--r--  libvpx/vp8/common/arm/armv6/idct_blk_v6.c | 115
-rw-r--r--  libvpx/vp8/common/arm/armv6/idct_v6.asm | 202
-rw-r--r--  libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm | 611
-rw-r--r--  libvpx/vp8/common/arm/armv6/iwalsh_v6.asm | 136
-rw-r--r--  libvpx/vp8/common/arm/armv6/loopfilter_v6.asm | 1282
-rw-r--r--  libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm | 286
-rw-r--r--  libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm | 273
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm | 96
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm | 154
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm | 101
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm | 182
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm | 222
-rw-r--r--  libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm | 184
-rw-r--r--  libvpx/vp8/common/arm/bilinearfilter_arm.c | 113
-rw-r--r--  libvpx/vp8/common/arm/bilinearfilter_arm.h | 35
-rw-r--r--  libvpx/vp8/common/arm/dequantize_arm.c | 42
-rw-r--r--  libvpx/vp8/common/arm/filter_arm.c | 221
-rw-r--r--  libvpx/vp8/common/arm/loopfilter_arm.c | 181
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm | 357
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm | 130
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm | 135
-rw-r--r--  libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm | 183
-rw-r--r--  libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm | 584
-rw-r--r--  libvpx/vp8/common/arm/neon/copymem16x16_neon.asm | 59
-rw-r--r--  libvpx/vp8/common/arm/neon/copymem8x4_neon.asm | 34
-rw-r--r--  libvpx/vp8/common/arm/neon/copymem8x8_neon.asm | 43
-rw-r--r--  libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm | 54
-rw-r--r--  libvpx/vp8/common/arm/neon/dequant_idct_neon.asm | 131
-rw-r--r--  libvpx/vp8/common/arm/neon/dequantizeb_neon.asm | 34
-rw-r--r--  libvpx/vp8/common/arm/neon/idct_blk_neon.c | 96
-rw-r--r--  libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm | 79
-rw-r--r--  libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm | 196
-rw-r--r--  libvpx/vp8/common/arm/neon/iwalsh_neon.asm | 87
-rw-r--r--  libvpx/vp8/common/arm/neon/loopfilter_neon.asm | 397
-rw-r--r--  libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm | 117
-rw-r--r--  libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm | 154
-rw-r--r--  libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm | 469
-rw-r--r--  libvpx/vp8/common/arm/neon/sad16_neon.asm | 207
-rw-r--r--  libvpx/vp8/common/arm/neon/sad8_neon.asm | 209
-rw-r--r--  libvpx/vp8/common/arm/neon/save_reg_neon.asm | 36
-rw-r--r--  libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm | 139
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm | 490
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm | 422
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm | 473
-rw-r--r--  libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm | 524
-rw-r--r--  libvpx/vp8/common/arm/neon/variance_neon.asm | 276
-rw-r--r--  libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm | 423
-rw-r--r--  libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm | 572
-rw-r--r--  libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm | 222
-rw-r--r--  libvpx/vp8/common/arm/reconintra_arm.c | 58
-rw-r--r--  libvpx/vp8/common/arm/variance_arm.c | 132
-rw-r--r--  libvpx/vp8/common/asm_com_offsets.c | 71
-rw-r--r--  libvpx/vp8/common/blockd.c | 22
-rw-r--r--  libvpx/vp8/common/blockd.h | 300
-rw-r--r--  libvpx/vp8/common/coefupdateprobs.h | 185
-rw-r--r--  libvpx/vp8/common/common.h | 40
-rw-r--r--  libvpx/vp8/common/context.c | 399
-rw-r--r--  libvpx/vp8/common/debugmodes.c | 157
-rw-r--r--  libvpx/vp8/common/default_coef_probs.h | 188
-rw-r--r--  libvpx/vp8/common/dequantize.c | 43
-rw-r--r--  libvpx/vp8/common/entropy.c | 189
-rw-r--r--  libvpx/vp8/common/entropy.h | 101
-rw-r--r--  libvpx/vp8/common/entropymode.c | 171
-rw-r--r--  libvpx/vp8/common/entropymode.h | 80
-rw-r--r--  libvpx/vp8/common/entropymv.c | 49
-rw-r--r--  libvpx/vp8/common/entropymv.h | 44
-rw-r--r--  libvpx/vp8/common/extend.c | 188
-rw-r--r--  libvpx/vp8/common/extend.h | 25
-rw-r--r--  libvpx/vp8/common/filter.c | 494
-rw-r--r--  libvpx/vp8/common/filter.h | 22
-rw-r--r--  libvpx/vp8/common/findnearmv.c | 193
-rw-r--r--  libvpx/vp8/common/findnearmv.h | 182
-rw-r--r--  libvpx/vp8/common/generic/systemdependent.c | 97
-rw-r--r--  libvpx/vp8/common/header.h | 43
-rw-r--r--  libvpx/vp8/common/idct_blk.c | 89
-rw-r--r--  libvpx/vp8/common/idctllm.c | 204
-rw-r--r--  libvpx/vp8/common/invtrans.h | 62
-rw-r--r--  libvpx/vp8/common/loopfilter.c | 679
-rw-r--r--  libvpx/vp8/common/loopfilter.h | 105
-rw-r--r--  libvpx/vp8/common/loopfilter_filters.c | 430
-rw-r--r--  libvpx/vp8/common/mbpitch.c | 68
-rw-r--r--  libvpx/vp8/common/mfqe.c | 385
-rw-r--r--  libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c | 33
-rw-r--r--  libvpx/vp8/common/mips/dspr2/filter_dspr2.c | 2823
-rw-r--r--  libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c | 88
-rw-r--r--  libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c | 369
-rw-r--r--  libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c | 2622
-rw-r--r--  libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c | 121
-rw-r--r--  libvpx/vp8/common/modecont.c | 40
-rw-r--r--  libvpx/vp8/common/modecont.h | 17
-rw-r--r--  libvpx/vp8/common/mv.h | 28
-rw-r--r--  libvpx/vp8/common/onyx.h | 269
-rw-r--r--  libvpx/vp8/common/onyxc_int.h | 179
-rw-r--r--  libvpx/vp8/common/onyxd.h | 68
-rw-r--r--  libvpx/vp8/common/postproc.c | 1206
-rw-r--r--  libvpx/vp8/common/postproc.h | 50
-rw-r--r--  libvpx/vp8/common/ppc/copy_altivec.asm | 47
-rw-r--r--  libvpx/vp8/common/ppc/filter_altivec.asm | 1013
-rw-r--r--  libvpx/vp8/common/ppc/filter_bilinear_altivec.asm | 677
-rw-r--r--  libvpx/vp8/common/ppc/idctllm_altivec.asm | 189
-rw-r--r--  libvpx/vp8/common/ppc/loopfilter_altivec.c | 135
-rw-r--r--  libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm | 1253
-rw-r--r--  libvpx/vp8/common/ppc/platform_altivec.asm | 59
-rw-r--r--  libvpx/vp8/common/ppc/recon_altivec.asm | 175
-rw-r--r--  libvpx/vp8/common/ppc/sad_altivec.asm | 277
-rw-r--r--  libvpx/vp8/common/ppc/systemdependent.c | 170
-rw-r--r--  libvpx/vp8/common/ppc/variance_altivec.asm | 375
-rw-r--r--  libvpx/vp8/common/ppc/variance_subpixel_altivec.asm | 865
-rw-r--r--  libvpx/vp8/common/ppflags.h | 41
-rw-r--r--  libvpx/vp8/common/pragmas.h | 19
-rw-r--r--  libvpx/vp8/common/quant_common.c | 135
-rw-r--r--  libvpx/vp8/common/quant_common.h | 21
-rw-r--r--  libvpx/vp8/common/reconinter.c | 595
-rw-r--r--  libvpx/vp8/common/reconinter.h | 35
-rw-r--r--  libvpx/vp8/common/reconintra.c | 288
-rw-r--r--  libvpx/vp8/common/reconintra4x4.c | 297
-rw-r--r--  libvpx/vp8/common/reconintra4x4.h | 32
-rw-r--r--  libvpx/vp8/common/rtcd.c | 105
-rw-r--r--  libvpx/vp8/common/rtcd_defs.sh | 568
-rw-r--r--  libvpx/vp8/common/sad_c.c | 302
-rw-r--r--  libvpx/vp8/common/setupintrarecon.c | 39
-rw-r--r--  libvpx/vp8/common/setupintrarecon.h | 33
-rw-r--r--  libvpx/vp8/common/swapyv12buffer.c | 34
-rw-r--r--  libvpx/vp8/common/swapyv12buffer.h | 19
-rw-r--r--  libvpx/vp8/common/systemdependent.h | 21
-rw-r--r--  libvpx/vp8/common/textblit.c | 130
-rw-r--r--  libvpx/vp8/common/threading.h | 186
-rw-r--r--  libvpx/vp8/common/treecoder.c | 143
-rw-r--r--  libvpx/vp8/common/treecoder.h | 90
-rw-r--r--  libvpx/vp8/common/variance.h | 115
-rw-r--r--  libvpx/vp8/common/variance_c.c | 458
-rw-r--r--  libvpx/vp8/common/vp8_entropymodedata.h | 242
-rw-r--r--  libvpx/vp8/common/x86/dequantize_mmx.asm | 258
-rw-r--r--  libvpx/vp8/common/x86/filter_x86.c | 35
-rw-r--r--  libvpx/vp8/common/x86/filter_x86.h | 19
-rw-r--r--  libvpx/vp8/common/x86/idct_blk_mmx.c | 127
-rw-r--r--  libvpx/vp8/common/x86/idct_blk_sse2.c | 89
-rw-r--r--  libvpx/vp8/common/x86/idctllm_mmx.asm | 295
-rw-r--r--  libvpx/vp8/common/x86/idctllm_sse2.asm | 708
-rw-r--r--  libvpx/vp8/common/x86/iwalsh_mmx.asm | 140
-rw-r--r--  libvpx/vp8/common/x86/iwalsh_sse2.asm | 121
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_block_sse2.asm | 813
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_mmx.asm | 1753
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_sse2.asm | 1640
-rw-r--r--  libvpx/vp8/common/x86/loopfilter_x86.c | 198
-rw-r--r--  libvpx/vp8/common/x86/mfqe_sse2.asm | 281
-rw-r--r--  libvpx/vp8/common/x86/postproc_mmx.asm | 314
-rw-r--r--  libvpx/vp8/common/x86/postproc_sse2.asm | 721
-rw-r--r--  libvpx/vp8/common/x86/postproc_x86.c | 24
-rw-r--r--  libvpx/vp8/common/x86/recon_mmx.asm | 274
-rw-r--r--  libvpx/vp8/common/x86/recon_sse2.asm | 1080
-rw-r--r--  libvpx/vp8/common/x86/recon_wrapper_sse2.c | 186
-rw-r--r--  libvpx/vp8/common/x86/sad_mmx.asm | 427
-rw-r--r--  libvpx/vp8/common/x86/sad_sse2.asm | 410
-rw-r--r--  libvpx/vp8/common/x86/sad_sse3.asm | 960
-rw-r--r--  libvpx/vp8/common/x86/sad_sse4.asm | 353
-rw-r--r--  libvpx/vp8/common/x86/sad_ssse3.asm | 370
-rw-r--r--  libvpx/vp8/common/x86/subpixel_mmx.asm | 702
-rw-r--r--  libvpx/vp8/common/x86/subpixel_sse2.asm | 1372
-rw-r--r--  libvpx/vp8/common/x86/subpixel_ssse3.asm | 1507
-rw-r--r--  libvpx/vp8/common/x86/variance_impl_mmx.asm | 851
-rw-r--r--  libvpx/vp8/common/x86/variance_impl_sse2.asm | 1359
-rw-r--r--  libvpx/vp8/common/x86/variance_impl_ssse3.asm | 364
-rw-r--r--  libvpx/vp8/common/x86/variance_mmx.c | 398
-rw-r--r--  libvpx/vp8/common/x86/variance_sse2.c | 558
-rw-r--r--  libvpx/vp8/common/x86/variance_ssse3.c | 166
-rw-r--r--  libvpx/vp8/common/x86/vp8_asm_stubs.c | 629
-rw-r--r--  libvpx/vp8/decoder/asm_dec_offsets.c | 26
-rw-r--r--  libvpx/vp8/decoder/dboolhuff.c | 52
-rw-r--r--  libvpx/vp8/decoder/dboolhuff.h | 154
-rw-r--r--  libvpx/vp8/decoder/decodemv.c | 668
-rw-r--r--  libvpx/vp8/decoder/decodemv.h | 14
-rw-r--r--  libvpx/vp8/decoder/decoderthreading.h | 26
-rw-r--r--  libvpx/vp8/decoder/decodframe.c | 1385
-rw-r--r--  libvpx/vp8/decoder/detokenize.c | 245
-rw-r--r--  libvpx/vp8/decoder/detokenize.h | 20
-rw-r--r--  libvpx/vp8/decoder/ec_types.h | 51
-rw-r--r--  libvpx/vp8/decoder/error_concealment.c | 598
-rw-r--r--  libvpx/vp8/decoder/error_concealment.h | 41
-rw-r--r--  libvpx/vp8/decoder/onyxd_if.c | 522
-rw-r--r--  libvpx/vp8/decoder/onyxd_int.h | 124
-rw-r--r--  libvpx/vp8/decoder/threading.c | 918
-rw-r--r--  libvpx/vp8/decoder/treereader.h | 41
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm | 310
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm | 317
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm | 352
-rw-r--r--  libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 471
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm | 225
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm | 138
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm | 262
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm | 272
-rw-r--r--  libvpx/vp8/encoder/arm/armv6/walsh_v6.asm | 212
-rw-r--r--  libvpx/vp8/encoder/arm/boolhuff_arm.c | 41
-rw-r--r--  libvpx/vp8/encoder/arm/dct_arm.c | 22
-rw-r--r--  libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm | 258
-rw-r--r--  libvpx/vp8/encoder/arm/neon/picklpf_arm.c | 46
-rw-r--r--  libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm | 221
-rw-r--r--  libvpx/vp8/encoder/arm/neon/subtract_neon.asm | 199
-rw-r--r--  libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 70
-rw-r--r--  libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm | 116
-rw-r--r--  libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm | 103
-rw-r--r--  libvpx/vp8/encoder/arm/quantize_arm.c | 64
-rw-r--r--  libvpx/vp8/encoder/asm_enc_offsets.c | 93
-rw-r--r--  libvpx/vp8/encoder/bitstream.c | 1732
-rw-r--r--  libvpx/vp8/encoder/bitstream.h | 46
-rw-r--r--  libvpx/vp8/encoder/block.h | 148
-rw-r--r--  libvpx/vp8/encoder/boolhuff.c | 70
-rw-r--r--  libvpx/vp8/encoder/boolhuff.h | 128
-rw-r--r--  libvpx/vp8/encoder/dct.c | 116
-rw-r--r--  libvpx/vp8/encoder/dct_value_cost.h | 358
-rw-r--r--  libvpx/vp8/encoder/dct_value_tokens.h | 699
-rw-r--r--  libvpx/vp8/encoder/defaultcoefcounts.h | 223
-rw-r--r--  libvpx/vp8/encoder/denoising.c | 309
-rw-r--r--  libvpx/vp8/encoder/denoising.h | 42
-rw-r--r--  libvpx/vp8/encoder/encodeframe.c | 1393
-rw-r--r--  libvpx/vp8/encoder/encodeframe.h | 27
-rw-r--r--  libvpx/vp8/encoder/encodeintra.c | 138
-rw-r--r--  libvpx/vp8/encoder/encodeintra.h | 21
-rw-r--r--  libvpx/vp8/encoder/encodemb.c | 648
-rw-r--r--  libvpx/vp8/encoder/encodemb.h | 26
-rw-r--r--  libvpx/vp8/encoder/encodemv.c | 380
-rw-r--r--  libvpx/vp8/encoder/encodemv.h | 21
-rw-r--r--  libvpx/vp8/encoder/ethreading.c | 642
-rw-r--r--  libvpx/vp8/encoder/firstpass.c | 3360
-rw-r--r--  libvpx/vp8/encoder/firstpass.h | 24
-rw-r--r--  libvpx/vp8/encoder/lookahead.c | 230
-rw-r--r--  libvpx/vp8/encoder/lookahead.h | 109
-rw-r--r--  libvpx/vp8/encoder/mcomp.c | 2026
-rw-r--r--  libvpx/vp8/encoder/mcomp.h | 107
-rw-r--r--  libvpx/vp8/encoder/modecosts.c | 54
-rw-r--r--  libvpx/vp8/encoder/modecosts.h | 17
-rw-r--r--  libvpx/vp8/encoder/mr_dissim.c | 236
-rw-r--r--  libvpx/vp8/encoder/mr_dissim.h | 20
-rw-r--r--  libvpx/vp8/encoder/onyx_if.c | 5568
-rw-r--r--  libvpx/vp8/encoder/onyx_int.h | 741
-rw-r--r--  libvpx/vp8/encoder/pickinter.c | 1295
-rw-r--r--  libvpx/vp8/encoder/pickinter.h | 27
-rw-r--r--  libvpx/vp8/encoder/picklpf.c | 406
-rw-r--r--  libvpx/vp8/encoder/ppc/csystemdependent.c | 160
-rw-r--r--  libvpx/vp8/encoder/ppc/encodemb_altivec.asm | 153
-rw-r--r--  libvpx/vp8/encoder/ppc/fdct_altivec.asm | 205
-rw-r--r--  libvpx/vp8/encoder/ppc/rdopt_altivec.asm | 51
-rw-r--r--  libvpx/vp8/encoder/psnr.c | 31
-rw-r--r--  libvpx/vp8/encoder/psnr.h | 17
-rw-r--r--  libvpx/vp8/encoder/quantize.c | 811
-rw-r--r--  libvpx/vp8/encoder/quantize.h | 23
-rw-r--r--  libvpx/vp8/encoder/ratectrl.c | 1534
-rw-r--r--  libvpx/vp8/encoder/ratectrl.h | 28
-rw-r--r--  libvpx/vp8/encoder/rdopt.c | 2629
-rw-r--r--  libvpx/vp8/encoder/rdopt.h | 133
-rw-r--r--  libvpx/vp8/encoder/segmentation.c | 66
-rw-r--r--  libvpx/vp8/encoder/segmentation.h | 16
-rw-r--r--  libvpx/vp8/encoder/ssim.c | 233
-rw-r--r--  libvpx/vp8/encoder/temporal_filter.c | 519
-rw-r--r--  libvpx/vp8/encoder/tokenize.c | 604
-rw-r--r--  libvpx/vp8/encoder/tokenize.h | 50
-rw-r--r--  libvpx/vp8/encoder/treewriter.c | 43
-rw-r--r--  libvpx/vp8/encoder/treewriter.h | 126
-rw-r--r--  libvpx/vp8/encoder/x86/dct_mmx.asm | 241
-rw-r--r--  libvpx/vp8/encoder/x86/dct_sse2.asm | 432
-rw-r--r--  libvpx/vp8/encoder/x86/denoising_sse2.c | 119
-rw-r--r--  libvpx/vp8/encoder/x86/encodeopt.asm | 386
-rw-r--r--  libvpx/vp8/encoder/x86/fwalsh_sse2.asm | 164
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_mmx.asm | 286
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_sse2.asm | 386
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_sse4.asm | 256
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_ssse3.asm | 138
-rw-r--r--  libvpx/vp8/encoder/x86/ssim_opt.asm | 216
-rw-r--r--  libvpx/vp8/encoder/x86/subtract_mmx.asm | 223
-rw-r--r--  libvpx/vp8/encoder/x86/subtract_sse2.asm | 245
-rw-r--r--  libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm | 207
-rw-r--r--  libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c | 78
-rw-r--r--  libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c | 43
-rw-r--r--  libvpx/vp8/exports_dec | 2
-rw-r--r--  libvpx/vp8/exports_enc | 2
-rw-r--r--  libvpx/vp8/vp8_common.mk | 193
-rw-r--r--  libvpx/vp8/vp8_cx_iface.c | 1317
-rw-r--r--  libvpx/vp8/vp8_dx_iface.c | 902
-rw-r--r--  libvpx/vp8/vp8cx.mk | 117
-rw-r--r--  libvpx/vp8/vp8cx_arm.mk | 44
-rw-r--r--  libvpx/vp8/vp8dx.mk | 66
291 files changed, 99265 insertions, 0 deletions
diff --git a/libvpx/vp8/common/alloccommon.c b/libvpx/vp8/common/alloccommon.c
new file mode 100644
index 0000000..8af9e90
--- /dev/null
+++ b/libvpx/vp8/common/alloccommon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "onyxc_int.h"
+#include "findnearmv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
+{
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]);
+
+ vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame);
+#if CONFIG_POSTPROC
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer);
+ if (oci->post_proc_buffer_int_used)
+ vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer_int);
+
+ vpx_free(oci->pp_limits_buffer);
+ oci->pp_limits_buffer = NULL;
+#endif
+
+ vpx_free(oci->above_context);
+ vpx_free(oci->mip);
+#if CONFIG_ERROR_CONCEALMENT
+ vpx_free(oci->prev_mip);
+ oci->prev_mip = NULL;
+#endif
+
+ oci->above_context = NULL;
+ oci->mip = NULL;
+}
+
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
+{
+ int i;
+
+ vp8_de_alloc_frame_buffers(oci);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ {
+ oci->fb_idx_ref_cnt[i] = 0;
+ oci->yv12_fb[i].flags = 0;
+ if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+ }
+
+ oci->new_fb_idx = 0;
+ oci->lst_fb_idx = 1;
+ oci->gld_fb_idx = 2;
+ oci->alt_fb_idx = 3;
+
+ oci->fb_idx_ref_cnt[0] = 1;
+ oci->fb_idx_ref_cnt[1] = 1;
+ oci->fb_idx_ref_cnt[2] = 1;
+ oci->fb_idx_ref_cnt[3] = 1;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+
+ oci->mb_rows = height >> 4;
+ oci->mb_cols = width >> 4;
+ oci->MBs = oci->mb_rows * oci->mb_cols;
+ oci->mode_info_stride = oci->mb_cols + 1;
+ oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
+
+ if (!oci->mip)
+ goto allocation_fail;
+
+ oci->mi = oci->mip + oci->mode_info_stride + 1;
+
+ /* Allocation of previous mode info will be done in vp8_decode_frame()
+ * as it is a decoder only data */
+
+ oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
+
+ if (!oci->above_context)
+ goto allocation_fail;
+
+#if CONFIG_POSTPROC
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
+ goto allocation_fail;
+
+ oci->post_proc_buffer_int_used = 0;
+ vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+ vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
+ oci->post_proc_buffer.frame_size);
+
+ /* Allocate buffer to store post-processing filter coefficients.
+ *
+ * Note: Round up mb_cols to support SIMD reads
+ */
+ oci->pp_limits_buffer = vpx_memalign(16, 24 * ((oci->mb_cols + 1) & ~1));
+ if (!oci->pp_limits_buffer)
+ goto allocation_fail;
+#endif
+
+ return 0;
+
+allocation_fail:
+ vp8_de_alloc_frame_buffers(oci);
+ return 1;
+}
+
+void vp8_setup_version(VP8_COMMON *cm)
+{
+ switch (cm->version)
+ {
+ case 0:
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ case 1:
+ cm->no_lpf = 0;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 2:
+ cm->no_lpf = 1;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 0;
+ break;
+ case 3:
+ cm->no_lpf = 1;
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 1;
+ cm->full_pixel = 1;
+ break;
+ default:
+ /*4,5,6,7 are reserved for future use*/
+ cm->no_lpf = 0;
+ cm->filter_type = NORMAL_LOOPFILTER;
+ cm->use_bilinear_mc_filter = 0;
+ cm->full_pixel = 0;
+ break;
+ }
+}
+void vp8_create_common(VP8_COMMON *oci)
+{
+ vp8_machine_specific_config(oci);
+
+ vp8_init_mbmode_probs(oci);
+ vp8_default_bmode_probs(oci->fc.bmode_prob);
+
+ oci->mb_no_coeff_skip = 1;
+ oci->no_lpf = 0;
+ oci->filter_type = NORMAL_LOOPFILTER;
+ oci->use_bilinear_mc_filter = 0;
+ oci->full_pixel = 0;
+ oci->multi_token_partition = ONE_PARTITION;
+ oci->clr_type = REG_YUV;
+ oci->clamp_type = RECON_CLAMP_REQUIRED;
+
+ /* Initialize reference frame sign bias structure to defaults */
+ vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+
+ /* Default disable buffer to buffer copying */
+ oci->copy_buffer_to_gf = 0;
+ oci->copy_buffer_to_arf = 0;
+}
+
+void vp8_remove_common(VP8_COMMON *oci)
+{
+ vp8_de_alloc_frame_buffers(oci);
+}
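
For context, here is a short sketch of how a caller might drive the allocators above; the wrapper name init_common and the exact call order are illustrative assumptions, not code from this commit:

    /* Hypothetical caller: set defaults, map the bitstream version to
     * filter flags, then allocate the frame buffers. */
    #include "alloccommon.h"

    static int init_common(VP8_COMMON *oci, int width, int height)
    {
        vp8_create_common(oci);      /* mode probs and default settings */
        vp8_setup_version(oci);      /* assumes oci->version was set already */

        /* vp8_alloc_frame_buffers() returns 0 on success, 1 on failure;
         * its failure path already de-allocates everything it touched. */
        if (vp8_alloc_frame_buffers(oci, width, height))
            return -1;
        return 0;
    }
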
diff --git a/libvpx/vp8/common/alloccommon.h b/libvpx/vp8/common/alloccommon.h
new file mode 100644
index 0000000..ea93c25
--- /dev/null
+++ b/libvpx/vp8/common/alloccommon.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ALLOCCOMMON_H
+#define __INC_ALLOCCOMMON_H
+
+#include "onyxc_int.h"
+
+void vp8_create_common(VP8_COMMON *oci);
+void vp8_remove_common(VP8_COMMON *oci);
+void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
+int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
+void vp8_setup_version(VP8_COMMON *oci);
+
+#endif
diff --git a/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm b/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
new file mode 100644
index 0000000..9704b42
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/bilinearfilter_v6.asm
@@ -0,0 +1,237 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned short *dst_ptr,
+; r2 unsigned int src_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vp8_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make the second-pass filtering easier.
+|vp8_filter_block2d_bil_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; width
+
+ mov r12, r3 ; outer-loop counter
+
+ add r7, r2, r4 ; preload next row
+ pld [r0, r7]
+
+ sub r2, r2, r4 ; src increment for height loop
+
+ ldr r5, [r11] ; load up filter coefficients
+
+ mov r3, r3, lsl #1 ; height*2
+ add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
+
+ mov r11, r1 ; save dst_ptr for each row
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+ ldrb r6, [r0] ; load source data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ mov lr, r4, lsr #2 ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+ ldrb r9, [r0, #3]
+ ldrb r10, [r0, #4]
+
+ pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
+
+ smuad r6, r6, r5 ; apply the filter
+ pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
+ smuad r7, r7, r5
+ pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
+
+ smuad r8, r8, r5
+ smuad r9, r9, r5
+
+ add r0, r0, #4
+ subs lr, lr, #1
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #16, r6, asr #7
+ usat r7, #16, r7, asr #7
+
+ strh r6, [r1], r3 ; result is transposed and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strh r7, [r1], r3
+ add r9, r9, #0x40
+ usat r8, #16, r8, asr #7
+ usat r9, #16, r9, asr #7
+
+ strh r8, [r1], r3 ; result is transposed and stored
+
+ ldrneb r6, [r0] ; load source data
+ strh r9, [r1], r3
+
+ ldrneb r7, [r0, #1]
+ ldrneb r8, [r0, #2]
+
+ bne bil_width_loop_1st_v6
+
+ add r0, r0, r2 ; move to next input row
+ subs r12, r12, #1
+
+ add r9, r2, r4, lsl #1 ; adding back block width
+ pld [r0, r9] ; preload next row
+
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_1st_v6
+
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+ mov lr, r4, lsr #2 ; loop counter
+
+|bil_width_loop_null_1st|
+ ldrb r6, [r0] ; load data
+ ldrb r7, [r0, #1]
+ ldrb r8, [r0, #2]
+ ldrb r9, [r0, #3]
+
+ strh r6, [r1], r3 ; store it to immediate buffer
+ add r0, r0, #4
+ strh r7, [r1], r3
+ subs lr, lr, #1
+ strh r8, [r1], r3
+ strh r9, [r1], r3
+
+ bne bil_width_loop_null_1st
+
+ subs r12, r12, #1
+ add r0, r0, r2 ; move to next input line
+ add r11, r11, #2 ; move over to next column
+ mov r1, r11
+
+ bne bil_height_loop_null_1st
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
+
+
+;---------------------------------
+; r0 unsigned short *src_ptr,
+; r1 unsigned char *dst_ptr,
+; r2 int dst_pitch,
+; r3 unsigned int height,
+; stack unsigned int width,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_bil_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r4, [sp, #36] ; width
+
+ ldr r5, [r11] ; load up filter coefficients
+ mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
+ mov r11, r1
+
+ cmp r5, #128 ; if filter coef = 128, then skip the filter
+ beq bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+ ldr r6, [r0] ; load the data
+ ldr r8, [r0, #4]
+ ldrh r10, [r0, #8]
+ mov lr, r3, lsr #2 ; loop counter
+
+|bil_width_loop_2nd|
+ pkhtb r7, r6, r8 ; src[1] | src[2]
+ pkhtb r9, r8, r10 ; src[3] | src[4]
+
+ smuad r6, r6, r5 ; apply filter
+ smuad r8, r8, r5 ; apply filter
+
+ subs lr, lr, #1
+
+ smuadx r7, r7, r5 ; apply filter
+ smuadx r9, r9, r5 ; apply filter
+
+ add r0, r0, #8
+
+ add r6, r6, #0x40 ; round_shift_and_clamp
+ add r7, r7, #0x40
+ usat r6, #8, r6, asr #7
+ usat r7, #8, r7, asr #7
+ strb r6, [r1], r2 ; the result is transposed back and stored
+
+ add r8, r8, #0x40 ; round_shift_and_clamp
+ strb r7, [r1], r2
+ add r9, r9, #0x40
+ usat r8, #8, r8, asr #7
+ usat r9, #8, r9, asr #7
+ strb r8, [r1], r2 ; the result is transposed back and stored
+
+ ldrne r6, [r0] ; load data
+ strb r9, [r1], r2
+ ldrne r8, [r0, #4]
+ ldrneh r10, [r0, #8]
+
+ bne bil_width_loop_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4 ; update src for next row
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_2nd
+ ldmia sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+ mov lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+ ldr r6, [r0], #4 ; load data
+ subs lr, lr, #1
+ ldr r8, [r0], #4
+
+ strb r6, [r1], r2 ; store data
+ mov r7, r6, lsr #16
+ strb r7, [r1], r2
+ mov r9, r8, lsr #16
+ strb r8, [r1], r2
+ strb r9, [r1], r2
+
+ bne bil_width_loop_null_2nd
+
+ subs r12, r12, #1
+ add r0, r0, #4
+ add r11, r11, #1
+ mov r1, r11
+
+ bne bil_height_loop_null_2nd
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
+
+ END
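
Stripping away the transposed storage and the coefficient==128 shortcut, both routines above reduce to a 2-tap weighted average with 7-bit coefficients (the two taps sum to 128, hence the round-by-64, shift-by-7 sequence). A C sketch of the first (horizontal) pass, with illustrative names; the real code stores its results transposed so the second pass can walk them the same way:

    /* Sketch of one bilinear pass: each output is a weighted average of
     * two neighbouring source pixels; filter[0] + filter[1] == 128. */
    static void bilinear_first_pass(const unsigned char *src, unsigned short *dst,
                                    unsigned int src_pitch, unsigned int height,
                                    unsigned int width, const short *vp8_filter)
    {
        unsigned int i, j;

        for (i = 0; i < height; i++)
        {
            for (j = 0; j < width; j++)
            {
                int sum = src[j] * vp8_filter[0] + src[j + 1] * vp8_filter[1];
                dst[j] = (unsigned short)((sum + 0x40) >> 7); /* round_shift_and_clamp */
            }
            src += src_pitch;
            dst += width;
        }
    }

The "null filter" branches are this same loop degenerating to a plain copy when the first coefficient is 128, which is why the assembly tests it against #128 up front.
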
diff --git a/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm b/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
new file mode 100644
index 0000000..abf048c
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/copymem16x16_v6.asm
@@ -0,0 +1,186 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_v6| PROC
+ stmdb sp!, {r4 - r7}
+ ;push {r4-r7}
+
+ ;preload
+ pld [r0, #31] ; preload for next 16x16 block
+
+ ands r4, r0, #15
+ beq copy_mem16x16_fast
+
+ ands r4, r0, #7
+ beq copy_mem16x16_8
+
+ ands r4, r0, #3
+ beq copy_mem16x16_4
+
+ ;copy one byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+ ldrb r6, [r0, #2]
+ ldrb r7, [r0, #3]
+
+ mov r12, #16
+
+copy_mem16x16_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+ strb r6, [r2, #2]
+ strb r7, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+ ldrb r6, [r0, #6]
+ ldrb r7, [r0, #7]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+ strb r6, [r2, #6]
+ strb r7, [r2, #7]
+
+ ldrb r4, [r0, #8]
+ ldrb r5, [r0, #9]
+ ldrb r6, [r0, #10]
+ ldrb r7, [r0, #11]
+
+ strb r4, [r2, #8]
+ strb r5, [r2, #9]
+ strb r6, [r2, #10]
+ strb r7, [r2, #11]
+
+ ldrb r4, [r0, #12]
+ ldrb r5, [r0, #13]
+ ldrb r6, [r0, #14]
+ ldrb r7, [r0, #15]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #12]
+ strb r5, [r2, #13]
+ strb r6, [r2, #14]
+ strb r7, [r2, #15]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+ ldrneb r6, [r0, #2]
+ ldrneb r7, [r0, #3]
+
+ pld [r0, #31] ; preload for next 16x16 block
+
+ bne copy_mem16x16_1_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem16x16_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+ ldr r6, [r0, #8]
+ ldr r7, [r0, #12]
+
+ mov r12, #16
+
+copy_mem16x16_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+ str r6, [r2, #8]
+ str r7, [r2, #12]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+ ldrne r6, [r0, #8]
+ ldrne r7, [r0, #12]
+
+ pld [r0, #31] ; preload for next 16x16 block
+
+ bne copy_mem16x16_4_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem16x16_8
+ sub r1, r1, #16
+ sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_8_loop
+ ldmia r0!, {r4-r5}
+ ;ldm r0, {r4-r5}
+ ldmia r0!, {r6-r7}
+
+ add r0, r0, r1
+
+ stmia r2!, {r4-r5}
+ subs r12, r12, #1
+ ;stm r2, {r4-r5}
+ stmia r2!, {r6-r7}
+
+ add r2, r2, r3
+
+ pld [r0, #31] ; preload for next 16x16 block
+ bne copy_mem16x16_8_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+;copy 16 bytes each time
+copy_mem16x16_fast
+ ;sub r1, r1, #16
+ ;sub r3, r3, #16
+
+ mov r12, #16
+
+copy_mem16x16_fast_loop
+ ldmia r0, {r4-r7}
+ ;ldm r0, {r4-r7}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r7}
+ ;stm r2, {r4-r7}
+ add r2, r2, r3
+
+ pld [r0, #31] ; preload for next 16x16 block
+ bne copy_mem16x16_fast_loop
+
+ ldmia sp!, {r4 - r7}
+ ;pop {r4-r7}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_v6|
+
+ END
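
Behind the alignment dispatch, the routine is simply a strided 16x16 byte copy; the checks only select the widest load/store width the source alignment permits. An equivalent C sketch (helper name is illustrative):

    /* C model of the copy: move a 16x16 block between two strided buffers. */
    #include <string.h>

    static void copy_mem16x16(const unsigned char *src, int src_stride,
                              unsigned char *dst, int dst_stride)
    {
        int r;

        for (r = 0; r < 16; r++)
        {
            memcpy(dst, src, 16);
            src += src_stride;
            dst += dst_stride;
        }
    }
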
diff --git a/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm b/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm
new file mode 100644
index 0000000..d8362ef
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/copymem8x4_v6.asm
@@ -0,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x4_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x4_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #4
+
+copy_mem8x4_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x4_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x4_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #4
+
+copy_mem8x4_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x4_4_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x4_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #4
+
+copy_mem8x4_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x4_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_v6|
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm b/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm
new file mode 100644
index 0000000..c6a60c6
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/copymem8x8_v6.asm
@@ -0,0 +1,128 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_v6|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_v6| PROC
+ ;push {r4-r5}
+ stmdb sp!, {r4-r5}
+
+ ;preload
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ ands r4, r0, #7
+ beq copy_mem8x8_fast
+
+ ands r4, r0, #3
+ beq copy_mem8x8_4
+
+ ;copy 1 byte each time
+ ldrb r4, [r0]
+ ldrb r5, [r0, #1]
+
+ mov r12, #8
+
+copy_mem8x8_1_loop
+ strb r4, [r2]
+ strb r5, [r2, #1]
+
+ ldrb r4, [r0, #2]
+ ldrb r5, [r0, #3]
+
+ subs r12, r12, #1
+
+ strb r4, [r2, #2]
+ strb r5, [r2, #3]
+
+ ldrb r4, [r0, #4]
+ ldrb r5, [r0, #5]
+
+ strb r4, [r2, #4]
+ strb r5, [r2, #5]
+
+ ldrb r4, [r0, #6]
+ ldrb r5, [r0, #7]
+
+ add r0, r0, r1
+
+ strb r4, [r2, #6]
+ strb r5, [r2, #7]
+
+ add r2, r2, r3
+
+ ldrneb r4, [r0]
+ ldrneb r5, [r0, #1]
+
+ bne copy_mem8x8_1_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 4 bytes each time
+copy_mem8x8_4
+ ldr r4, [r0]
+ ldr r5, [r0, #4]
+
+ mov r12, #8
+
+copy_mem8x8_4_loop
+ subs r12, r12, #1
+ add r0, r0, r1
+
+ str r4, [r2]
+ str r5, [r2, #4]
+
+ add r2, r2, r3
+
+ ldrne r4, [r0]
+ ldrne r5, [r0, #4]
+
+ bne copy_mem8x8_4_loop
+
+ ldmia sp!, {r4 - r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+;copy 8 bytes each time
+copy_mem8x8_fast
+ ;sub r1, r1, #8
+ ;sub r3, r3, #8
+
+ mov r12, #8
+
+copy_mem8x8_fast_loop
+ ldmia r0, {r4-r5}
+ ;ldm r0, {r4-r5}
+ add r0, r0, r1
+
+ subs r12, r12, #1
+ stmia r2, {r4-r5}
+ ;stm r2, {r4-r5}
+ add r2, r2, r3
+
+ bne copy_mem8x8_fast_loop
+
+ ldmia sp!, {r4-r5}
+ ;pop {r4-r5}
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_v6|
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
new file mode 100644
index 0000000..9aa659f
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -0,0 +1,70 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dc_only_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+; int pred_stride, unsigned char *dst_ptr,
+; int dst_stride)
+; r0 input_dc
+; r1 pred_ptr
+; r2 pred_stride
+; r3 dst_ptr
+; sp dst_stride
+
+|vp8_dc_only_idct_add_v6| PROC
+ stmdb sp!, {r4 - r7}
+
+ add r0, r0, #4 ; input_dc += 4
+ ldr r12, c0x0000FFFF
+ ldr r4, [r1], r2
+ and r0, r12, r0, asr #3 ; (input_dc >> 3) & 0xFFFF
+ ldr r6, [r1], r2
+ orr r0, r0, r0, lsl #16 ; a1 | a1
+
+ ldr r12, [sp, #16] ; dst stride
+
+ uxtab16 r5, r0, r4 ; a1+2 | a1+0
+ uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ ldr r4, [r1], r2
+ str r5, [r3], r12
+ ldr r6, [r1]
+ str r7, [r3], r12
+
+ uxtab16 r5, r0, r4
+ uxtab16 r4, r0, r4, ror #8
+ uxtab16 r7, r0, r6
+ uxtab16 r6, r0, r6, ror #8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 r7, #8, r7
+ usat16 r6, #8, r6
+ orr r5, r5, r4, lsl #8
+ orr r7, r7, r6, lsl #8
+ str r5, [r3], r12
+ str r7, [r3]
+
+ ldmia sp!, {r4 - r7}
+ bx lr
+
+ ENDP ; |vp8_dc_only_idct_add_v6|
+
+; Constant Pool
+c0x0000FFFF DCD 0x0000FFFF
+ END
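
The uxtab16/usat16 pairs above process four pixels at a time; a scalar C sketch of the whole routine (matching the prototype quoted in the header comment, helper name illustrative) is just "add the rounded DC term to each predicted pixel and saturate":

    /* Scalar model: a1 = (dc + 4) >> 3 is added to every pixel of the
     * 4x4 prediction block and the result is clamped to 8 bits. */
    static void dc_only_idct_add(short input_dc, unsigned char *pred_ptr,
                                 int pred_stride, unsigned char *dst_ptr,
                                 int dst_stride)
    {
        int a1 = (input_dc + 4) >> 3;
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int v = pred_ptr[c] + a1;
                dst_ptr[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred_ptr += pred_stride;
            dst_ptr += dst_stride;
        }
    }
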
diff --git a/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
new file mode 100644
index 0000000..2510ad8
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -0,0 +1,190 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp8_dequant_idct_add_v6|
+
+ AREA |.text|, CODE, READONLY
+;void vp8_dequant_idct_add_v6(short *input, short *dq,
+; unsigned char *dest, int stride)
+; r0 = q
+; r1 = dq
+; r2 = dst
+; r3 = stride
+
+|vp8_dequant_idct_add_v6| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ ldr r4, [r0] ;input
+ ldr r5, [r1], #4 ;dq
+
+ sub sp, sp, #4
+ str r3, [sp]
+
+ mov r12, #4
+
+vp8_dequant_add_loop
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ ldr r4, [r0, #4] ;input
+ ldr r5, [r1], #4 ;dq
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ smulbb r6, r4, r5
+ smultt r7, r4, r5
+
+ subs r12, r12, #1
+
+ ldrne r4, [r0, #4]
+ ldrne r5, [r1], #4
+
+ strh r6, [r0], #2
+ strh r7, [r0], #2
+
+ bne vp8_dequant_add_loop
+
+ sub r0, r0, #32
+ mov r1, r0
+
+; short_idct4x4llm_v6_dual
+ ldr r3, cospi8sqrt2minus1
+ ldr r4, sinpi8sqrt2
+ ldr r6, [r0, #8]
+ mov r5, #2
+vp8_dequant_idct_loop1_v6
+ ldr r12, [r0, #24]
+ ldr r14, [r0, #16]
+ smulwt r9, r3, r6
+ smulwb r7, r3, r6
+ smulwt r10, r4, r6
+ smulwb r8, r4, r6
+ pkhbt r7, r7, r9, lsl #16
+ smulwt r11, r3, r12
+ pkhbt r8, r8, r10, lsl #16
+ uadd16 r6, r6, r7
+ smulwt r7, r4, r12
+ smulwb r9, r3, r12
+ smulwb r10, r4, r12
+ subs r5, r5, #1
+ pkhbt r9, r9, r11, lsl #16
+ ldr r11, [r0], #4
+ pkhbt r10, r10, r7, lsl #16
+ uadd16 r7, r12, r9
+ usub16 r7, r8, r7
+ uadd16 r6, r6, r10
+ uadd16 r10, r11, r14
+ usub16 r8, r11, r14
+ uadd16 r9, r10, r6
+ usub16 r10, r10, r6
+ uadd16 r6, r8, r7
+ usub16 r7, r8, r7
+ str r6, [r1, #8]
+ ldrne r6, [r0, #8]
+ str r7, [r1, #16]
+ str r10, [r1, #24]
+ str r9, [r1], #4
+ bne vp8_dequant_idct_loop1_v6
+
+ mov r5, #2
+ sub r0, r1, #8
+vp8_dequant_idct_loop2_v6
+ ldr r6, [r0], #4
+ ldr r7, [r0], #4
+ ldr r8, [r0], #4
+ ldr r9, [r0], #4
+ smulwt r1, r3, r6
+ smulwt r12, r4, r6
+ smulwt lr, r3, r8
+ smulwt r10, r4, r8
+ pkhbt r11, r8, r6, lsl #16
+ pkhbt r1, lr, r1, lsl #16
+ pkhbt r12, r10, r12, lsl #16
+ pkhtb r6, r6, r8, asr #16
+ uadd16 r6, r1, r6
+ pkhbt lr, r9, r7, lsl #16
+ uadd16 r10, r11, lr
+ usub16 lr, r11, lr
+ pkhtb r8, r7, r9, asr #16
+ subs r5, r5, #1
+ smulwt r1, r3, r8
+ smulwb r7, r3, r8
+ smulwt r11, r4, r8
+ smulwb r9, r4, r8
+ pkhbt r1, r7, r1, lsl #16
+ uadd16 r8, r1, r8
+ pkhbt r11, r9, r11, lsl #16
+ usub16 r1, r12, r8
+ uadd16 r8, r11, r6
+ ldr r9, c0x00040004
+ ldr r12, [sp] ; get stride from stack
+ uadd16 r6, r10, r8
+ usub16 r7, r10, r8
+ uadd16 r7, r7, r9
+ uadd16 r6, r6, r9
+ uadd16 r10, r14, r1
+ usub16 r1, r14, r1
+ uadd16 r10, r10, r9
+ uadd16 r1, r1, r9
+ ldr r11, [r2] ; load input from dst
+ mov r8, r7, asr #3
+ pkhtb r9, r8, r10, asr #19
+ mov r8, r1, asr #3
+ pkhtb r8, r8, r6, asr #19
+ uxtb16 lr, r11, ror #8
+ qadd16 r9, r9, lr
+ uxtb16 lr, r11
+ qadd16 r8, r8, lr
+ usat16 r9, #8, r9
+ usat16 r8, #8, r8
+ orr r9, r8, r9, lsl #8
+ ldr r11, [r2, r12] ; load input from dst
+ mov r7, r7, lsl #16
+ mov r1, r1, lsl #16
+ mov r10, r10, lsl #16
+ mov r6, r6, lsl #16
+ mov r7, r7, asr #3
+ pkhtb r7, r7, r10, asr #19
+ mov r1, r1, asr #3
+ pkhtb r1, r1, r6, asr #19
+ uxtb16 r8, r11, ror #8
+ qadd16 r7, r7, r8
+ uxtb16 r8, r11
+ qadd16 r1, r1, r8
+ usat16 r7, #8, r7
+ usat16 r1, #8, r1
+ orr r1, r1, r7, lsl #8
+ str r9, [r2], r12 ; store output to dst
+ str r1, [r2], r12 ; store output to dst
+ bne vp8_dequant_idct_loop2_v6
+
+; vpx_memset
+ sub r0, r0, #32
+ add sp, sp, #4
+
+ mov r12, #0
+ str r12, [r0]
+ str r12, [r0, #4]
+ str r12, [r0, #8]
+ str r12, [r0, #12]
+ str r12, [r0, #16]
+ str r12, [r0, #20]
+ str r12, [r0, #24]
+ str r12, [r0, #28]
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_dequant_idct_add_v6|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x00004E7B
+sinpi8sqrt2 DCD 0x00008A8C
+c0x00040004 DCD 0x00040004
+
+ END
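
After the dequantization loop, the two inner loops above are the column and row passes of VP8's LLM 4x4 inverse DCT, packed two columns at a time into 32-bit registers. A scalar sketch of one 1-D pass, using the decimal values of the pool constants (0x8A8C = 35468, 0x4E7B = 20091); the helper name and layout are illustrative, and in the real routine the row pass also adds 4 and shifts right by 3 before the saturating add into the destination:

    /* One 1-D pass of the 4x4 LLM inverse transform; applied first down
     * the columns, then across the rows. */
    static void idct4_1d(const short in[4], short out[4])
    {
        int a1, b1, c1, d1, temp1, temp2;

        a1 = in[0] + in[2];
        b1 = in[0] - in[2];

        temp1 = (in[1] * 35468) >> 16;             /* sinpi8sqrt2 */
        temp2 = in[3] + ((in[3] * 20091) >> 16);   /* cospi8sqrt2minus1 */
        c1 = temp1 - temp2;

        temp1 = in[1] + ((in[1] * 20091) >> 16);
        temp2 = (in[3] * 35468) >> 16;
        d1 = temp1 + temp2;

        out[0] = (short)(a1 + d1);
        out[1] = (short)(b1 + c1);
        out[2] = (short)(b1 - c1);
        out[3] = (short)(a1 - d1);
    }
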
diff --git a/libvpx/vp8/common/arm/armv6/dequantize_v6.asm b/libvpx/vp8/common/arm/armv6/dequantize_v6.asm
new file mode 100644
index 0000000..72f7e0e
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/dequantize_v6.asm
@@ -0,0 +1,69 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_v6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------
+;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+|vp8_dequantize_b_loop_v6| PROC
+ stmdb sp!, {r4-r9, lr}
+
+ ldr r3, [r0] ;load Q
+ ldr r4, [r1] ;load DQC
+ ldr r5, [r0, #4]
+ ldr r6, [r1, #4]
+
+ mov r12, #2 ;loop counter
+
+dequant_loop
+ smulbb r7, r3, r4 ;multiply
+ smultt r8, r3, r4
+ smulbb r9, r5, r6
+ smultt lr, r5, r6
+
+ ldr r3, [r0, #8]
+ ldr r4, [r1, #8]
+ ldr r5, [r0, #12]
+ ldr r6, [r1, #12]
+
+ strh r7, [r2], #2 ;store result
+ smulbb r7, r3, r4 ;multiply
+ strh r8, [r2], #2
+ smultt r8, r3, r4
+ strh r9, [r2], #2
+ smulbb r9, r5, r6
+ strh lr, [r2], #2
+ smultt lr, r5, r6
+
+ subs r12, r12, #1
+
+ add r0, r0, #16
+ add r1, r1, #16
+
+ ldrne r3, [r0]
+ strh r7, [r2], #2 ;store result
+ ldrne r4, [r1]
+ strh r8, [r2], #2
+ ldrne r5, [r0, #4]
+ strh r9, [r2], #2
+ ldrne r6, [r1, #4]
+ strh lr, [r2], #2
+
+ bne dequant_loop
+
+ ldmia sp!, {r4-r9, pc}
+ ENDP ;|vp8_dequantize_b_loop_v6|
+
+ END
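
Unrolling aside, the loop above is a single elementwise multiply of the 16 quantized coefficients by their dequantization factors; a C model (the helper name is illustrative):

    /* DQ[i] = Q[i] * DQC[i] over one 4x4 block of coefficients. */
    static void dequantize_b_loop(const short *Q, const short *DQC, short *DQ)
    {
        int i;

        for (i = 0; i < 16; i++)
            DQ[i] = Q[i] * DQC[i];
    }
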
diff --git a/libvpx/vp8/common/arm/armv6/filter_v6.asm b/libvpx/vp8/common/arm/armv6/filter_v6.asm
new file mode 100644
index 0000000..1ba91dd
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/filter_v6.asm
@@ -0,0 +1,624 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_filter_block2d_first_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_armv6|
+ EXPORT |vp8_filter4_block2d_second_pass_armv6|
+ EXPORT |vp8_filter_block2d_first_pass_only_armv6|
+ EXPORT |vp8_filter_block2d_second_pass_only_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr
+; r1 short *output_ptr
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int output_width
+; stack unsigned int output_height
+; stack const short *vp8_filter
+;-------------------------------------
+; Apply the 6-tap FIR filter to the input and store the result in the
+; output array, with the output being a 2-byte value and the input
+; being a 1-byte value.
+|vp8_filter_block2d_first_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 16x16 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_16x16_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #18 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_16_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_16_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_16_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #34 ; adding back block width(=16)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_16_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+; --------------------------
+; 8x8 version
+; -----------------------------
+|vp8_filter_block2d_first_pass_8x8_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; vp8_filter address
+ ldr r7, [sp, #36] ; output height
+
+ add r4, r2, #10 ; preload next row
+ pld [r0, r4]
+
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
+ add r12, r3, #16 ; square off the output
+ sub sp, sp, #4
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r1, [sp] ; push destination to stack
+ mov r7, r7, lsl #16 ; height is top part of counter
+
+; six tap filter
+|height_loop_1st_8_6|
+ ldrb r8, [r0, #-2] ; load source data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+ orr r7, r7, r3, lsr #2 ; construct loop counter
+
+|width_loop_1st_8_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+ smuad lr, lr, r4 ; apply the filter
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ sub r7, r7, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r11, r10, r6, r8
+
+ ands r10, r7, #0xff ; test loop counter
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r11, r11, #0x40
+ ldrneb r9, [r0, #-1]
+ usat r11, #8, r11, asr #7
+
+ strh lr, [r1], r12 ; result is transposed and stored, which
+ ; will make second pass filtering easier.
+ ldrneb r10, [r0], #2
+ strh r11, [r1], r12
+
+ bne width_loop_1st_8_6
+
+ ldr r1, [sp] ; load and update dst address
+ subs r7, r7, #0x10000
+ add r0, r0, r2 ; move to next input line
+
+ add r11, r2, #18 ; adding back block width(=8)
+ pld [r0, r11] ; preload next row
+
+ add r1, r1, #2 ; move over to next column
+ str r1, [sp]
+
+ bne height_loop_1st_8_6
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ sub sp, sp, #4
+ mov r7, r3, lsl #16 ; height is top part of counter
+ str r1, [sp] ; push destination to stack
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+
+ sub r0, r0, #4 ; offset input buffer
+
+|height_loop_2nd|
+ ldr r8, [r0] ; load the data
+ ldr r9, [r0, #4]
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd|
+ smuad lr, r4, r8 ; apply filter
+ sub r7, r7, #1
+ smulbt r8, r4, r8
+
+ ldr r10, [r0, #8]
+
+ smlad lr, r5, r9, lr
+ smladx r8, r12, r9, r8
+
+ ldrh r9, [r0, #12]
+
+ smlad lr, r6, r10, lr
+ smladx r8, r11, r10, r8
+
+ add r0, r0, #4
+ smlatb r10, r6, r9, r8
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ands r8, r7, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r2 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ ldrne r8, [r0] ; load data for next loop
+ ldrne r9, [r0, #4]
+ strb r10, [r1], r2
+
+ bne width_loop_2nd
+
+ ldr r1, [sp] ; update dst for next loop
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ add r1, r1, #1
+ str r1, [sp]
+
+ bne height_loop_2nd
+
+ add sp, sp, #4
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;---------------------------------
+; r0 short *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int output_pitch,
+; r3 unsigned int cnt,
+; stack const short *vp8_filter
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #36] ; vp8_filter address
+ mov r7, r3, lsl #16 ; height is top part of counter
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ add lr, r1, r3 ; save final destination pointer
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ pkhbt r12, r5, r4 ; pack the filter differently
+ pkhbt r11, r6, r5
+ mov r4, #0x40 ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+ ldrd r8, [r0, #-4] ; load the data
+ orr r7, r7, r3, lsr #1 ; loop counter
+
+|width_loop_2nd_4|
+ ldr r10, [r0, #4]!
+ smladx r6, r9, r12, r4 ; apply filter
+ pkhbt r8, r9, r8
+ smlad r5, r8, r12, r4
+ pkhbt r8, r10, r9
+ smladx r6, r10, r11, r6
+ sub r7, r7, #1
+ smlad r5, r8, r11, r5
+
+ mov r8, r9 ; shift the data for the next loop
+ mov r9, r10
+
+ usat r6, #8, r6, asr #7 ; shift and clamp
+ usat r5, #8, r5, asr #7
+
+ strb r5, [r1], r2 ; the result is transposed back and stored
+ tst r7, #0xff
+ strb r6, [r1], r2
+
+ bne width_loop_2nd_4
+
+ subs r7, r7, #0x10000
+ add r0, r0, #16 ; update src for next loop
+ sub r1, lr, r7, lsr #16 ; update dst for next loop
+
+ bne height_loop_2nd_4
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;------------------------------------
+; r0 unsigned char *src_ptr
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_first_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ add r7, r2, r3 ; preload next row
+ add r7, r7, #2
+ pld [r0, r7]
+
+ ldr r4, [sp, #36] ; output pitch
+ ldr r11, [sp, #40] ; HFilter address
+ sub sp, sp, #8
+
+ mov r7, r3
+ sub r2, r2, r3 ; inside loop increments input array,
+ ; so the height loop only needs to add
+ ; r2 - width to the input pointer
+
+ sub r4, r4, r3
+ str r4, [sp] ; save modified output pitch
+ str r2, [sp, #4]
+
+ mov r2, #0x40
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+; six tap filter
+|height_loop_1st_only_6|
+ ldrb r8, [r0, #-2] ; load data
+ ldrb r9, [r0, #-1]
+ ldrb r10, [r0], #2
+
+ mov r12, r3, lsr #1 ; loop counter
+
+|width_loop_1st_only_6|
+ ldrb r11, [r0, #-1]
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0]
+
+;; smuad lr, lr, r4
+ smlad lr, lr, r4, r2
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+;; smuad r8, r8, r4
+ smlad r8, r8, r4, r2
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0, #1]
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0, #2]
+
+ subs r12, r12, #1
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+;; add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0, #-2] ; load data for next loop
+ usat lr, #8, lr, asr #7
+;; add r10, r10, #0x40
+ strb lr, [r1], #1 ; store the result
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0, #-1]
+ strb r10, [r1], #1
+ ldrneb r10, [r0], #2
+
+ bne width_loop_1st_only_6
+
+ ldr lr, [sp] ; load back output pitch
+ ldr r12, [sp, #4] ; load back modified src pitch
+ subs r7, r7, #1
+ add r0, r0, r12 ; update src for next loop
+
+ add r11, r12, r3 ; preload next row
+ add r11, r11, #2
+ pld [r0, r11]
+
+ add r1, r1, lr ; update dst for next loop
+
+ bne height_loop_1st_only_6
+
+ add sp, sp, #8
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
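
The packed smlad/smuad arithmetic above is an ordinary 6-tap FIR per output pixel, with the 0x40 rounding bias folded into the first multiply-accumulate. A scalar C sketch under the usual VP8 tap layout (taps cover src[-2]..src[3]); function and variable names are illustrative:

    #include <stdint.h>

    /* One output pixel of the 6-tap filter; 'filter' sums to 128, so the
     * result is rounded with +0x40, shifted right by 7, then clamped. */
    static uint8_t six_tap_pixel(const uint8_t *src, const int16_t filter[6])
    {
        int32_t acc = 0x40;               /* rounding bias, as in 'mov r2, #0x40' */
        int k;
        for (k = 0; k < 6; k++)
            acc += src[k - 2] * filter[k];
        acc >>= 7;                        /* VP8 filter shift */
        if (acc < 0) acc = 0;
        if (acc > 255) acc = 255;
        return (uint8_t)acc;
    }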
+
+
+;------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 unsigned char *output_ptr,
+; r2 unsigned int src_pixels_per_line
+; r3 unsigned int cnt,
+; stack unsigned int output_pitch,
+; stack const short *vp8_filter
+;------------------------------------
+|vp8_filter_block2d_second_pass_only_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+
+ ldr r11, [sp, #40] ; VFilter address
+ ldr r12, [sp, #36] ; output pitch
+
+ mov r7, r3, lsl #16 ; height is top part of counter
+ sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
+
+ sub sp, sp, #8
+
+ ldr r4, [r11] ; load up packed filter coefficients
+ ldr r5, [r11, #4]
+ ldr r6, [r11, #8]
+
+ str r0, [sp] ; save r0 to stack
+ str r1, [sp, #4] ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+ ldrb r8, [r0], r2 ; load data
+ orr r7, r7, r3 ; loop counter
+ ldrb r9, [r0], r2
+ ldrb r10, [r0], r2
+
+|height_loop_2nd_only_6|
+ ; filter the first column in this inner loop, then move to the next column.
+ ldrb r11, [r0], r2
+
+ pkhbt lr, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r8, r9, r10, lsl #16 ; r10 | r9
+
+ ldrb r9, [r0], r2
+
+ smuad lr, lr, r4
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+ smuad r8, r8, r4
+ pkhbt r11, r11, r9, lsl #16 ; r9 | r11
+
+ smlad lr, r10, r5, lr
+ ldrb r10, [r0], r2
+ smlad r8, r11, r5, r8
+ ldrb r11, [r0]
+
+ sub r7, r7, #2
+ sub r0, r0, r2, lsl #2
+
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+ pkhbt r10, r10, r11, lsl #16 ; r11 | r10
+
+ smlad lr, r9, r6, lr
+ smlad r10, r10, r6, r8
+
+ ands r9, r7, #0xff
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ ldrneb r8, [r0], r2 ; load data for next loop
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r1], r12 ; store the result for the column
+ usat r10, #8, r10, asr #7
+
+ ldrneb r9, [r0], r2
+ strb r10, [r1], r12
+ ldrneb r10, [r0], r2
+
+ bne height_loop_2nd_only_6
+
+ ldr r0, [sp]
+ ldr r1, [sp, #4]
+ subs r7, r7, #0x10000
+ add r0, r0, #1 ; move to filter next column
+ str r0, [sp]
+ add r1, r1, #1
+ str r1, [sp, #4]
+
+ bne width_loop_2nd_only_6
+
+ add sp, sp, #8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/idct_blk_v6.c b/libvpx/vp8/common/arm/armv6/idct_blk_v6.c
new file mode 100644
index 0000000..6002c0f
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/idct_blk_v6.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6 (q, dq, dst, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ if (eobs[2] > 1)
+ vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
+ ((int *)(q+32))[0] = 0;
+ }
+
+ if (eobs[3] > 1)
+ vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);
+ ((int *)(q+48))[0] = 0;
+ }
+
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstu += 4*stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstv += 4*stride;
+ eobs += 2;
+ }
+}
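
The dispatch above keys off the per-block end-of-block counts: eobs[i] > 1 takes the full dequantize-plus-IDCT path, eobs[i] == 1 takes the DC-only path and then clears q[0] and q[1] with a single word store, and eobs[i] == 0 skips the block entirely. A hedged sketch of the pattern, with the two _v6 routines abstracted behind illustrative function-pointer types:

    typedef void (*idct_add_fn)(short *q, short *dq, unsigned char *dst, int stride);
    typedef void (*dc_only_fn)(short input_dc, unsigned char *pred, int pred_stride,
                               unsigned char *dst, int dst_stride);

    /* Per-4x4-block dispatch; 'full' stands in for vp8_dequant_idct_add_v6
     * and 'dc_only' for vp8_dc_only_idct_add_v6. */
    static void dispatch_block(short *q, short *dq, unsigned char *dst,
                               int stride, char eob, idct_add_fn full,
                               dc_only_fn dc_only)
    {
        if (eob > 1)
            full(q, dq, dst, stride);        /* full dequant + inverse DCT */
        else if (eob == 1)
        {
            dc_only(q[0] * dq[0], dst, stride, dst, stride);
            ((int *)q)[0] = 0;               /* clear q[0] and q[1] in one store */
        }
        /* eob == 0: all coefficients are zero, nothing to add */
    }
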
diff --git a/libvpx/vp8/common/arm/armv6/idct_v6.asm b/libvpx/vp8/common/arm/armv6/idct_v6.asm
new file mode 100644
index 0000000..b4d44cb
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/idct_v6.asm
@@ -0,0 +1,202 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_v6_dual|
+
+ AREA |.text|, CODE, READONLY
+
+
+; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+; unsigned char *dst, int stride)
+; r0 short* input
+; r1 unsigned char* pred
+; r2 int pitch
+; r3 unsigned char* dst
+; sp int stride
+
+|vp8_short_idct4x4llm_v6_dual| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ sub sp, sp, #4
+
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+
+ mov r5, #0x00004E00 ; cos
+ orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
+ orr r5, r5, #1<<31 ; loop counter on top bit
+
+loop1_dual
+ ldr r6, [r0, #(4*2)] ; i5 | i4
+ ldr r12, [r0, #(12*2)] ; i13|i12
+ ldr r14, [r0, #(8*2)] ; i9 | i8
+
+ smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
+ smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
+
+ smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
+ pkhtb r7, r9, r7, asr #16 ; 5c | 4c
+ pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
+ smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
+
+ subs r5, r5, #1<<31 ; i--
+
+ pkhtb r9, r11, r9, asr #16 ; 13c | 12c
+ ldr r11, [r0] ; i1 | i0
+ pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+
+ usub16 r7, r8, r7 ; c
+ uadd16 r6, r6, r10 ; d
+ uadd16 r10, r11, r14 ; a
+ usub16 r8, r11, r14 ; b
+
+ uadd16 r9, r10, r6 ; a+d
+ usub16 r10, r10, r6 ; a-d
+ uadd16 r6, r8, r7 ; b+c
+ usub16 r7, r8, r7 ; b-c
+
+ ; use input buffer to store intermediate results
+ str r6, [r0, #(4*2)] ; o5 | o4
+ str r7, [r0, #(8*2)] ; o9 | o8
+ str r10,[r0, #(12*2)] ; o13|o12
+ str r9, [r0], #4 ; o1 | o0
+
+ bcs loop1_dual
+
+ sub r0, r0, #8 ; reset input/output
+ str r0, [sp]
+
+loop2_dual
+
+ ldr r6, [r0, #(4*2)] ; i5 | i4
+ ldr r12,[r0, #(2*2)] ; i3 | i2
+ ldr r14,[r0, #(6*2)] ; i7 | i6
+ ldr r0, [r0, #(0*2)] ; i1 | i0
+
+ smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
+ smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
+ smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
+
+ pkhbt r11, r6, r0, lsl #16 ; i0 | i4
+ pkhtb r7, r7, r9, asr #16 ; 1c | 5c
+ pkhtb r0, r0, r6, asr #16 ; i1 | i5
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
+
+ uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6
+ uadd16 r10, r11, r9 ; a
+ usub16 r9, r11, r9 ; b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7
+
+ subs r5, r5, #1<<31 ; i--
+
+ smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
+ smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
+
+ pkhtb r7, r7, r12, asr #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
+
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
+ usub16 r12, r8, r6 ; c (o1 | o5)
+ uadd16 r6, r11, r0 ; d (o3 | o7)
+ uadd16 r7, r10, r6 ; a+d
+
+ mov r8, #4 ; set up 4's
+ orr r8, r8, #0x40000 ; 4|4
+
+ usub16 r6, r10, r6 ; a-d
+ uadd16 r6, r6, r8 ; a-d+4, 3|7
+ uadd16 r7, r7, r8 ; a+d+4, 0|4
+ uadd16 r10, r9, r12 ; b+c
+ usub16 r0, r9, r12 ; b-c
+ uadd16 r10, r10, r8 ; b+c+4, 1|5
+ uadd16 r8, r0, r8 ; b-c+4, 2|6
+
+ ldr lr, [sp, #40] ; dst stride
+
+ ldrb r0, [r1] ; pred p0
+ ldrb r11, [r1, #1] ; pred p1
+ ldrb r12, [r1, #2] ; pred p2
+
+ add r0, r0, r7, asr #19 ; p0 + o0
+ add r11, r11, r10, asr #19 ; p1 + o1
+ add r12, r12, r8, asr #19 ; p2 + o2
+
+ usat r0, #8, r0 ; d0 = clip8(p0 + o0)
+ usat r11, #8, r11 ; d1 = clip8(p1 + o1)
+ usat r12, #8, r12 ; d2 = clip8(p2 + o2)
+
+ add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
+
+ ldrb r11, [r1, #3] ; pred p3
+
+ add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
+
+ add r11, r11, r6, asr #19 ; p3 + o3
+
+ sxth r7, r7 ;
+ sxth r10, r10 ;
+
+ usat r11, #8, r11 ; d3 = clip8(p3 + o3)
+
+ sxth r8, r8 ;
+ sxth r6, r6 ;
+
+ add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
+
+ ldrb r12, [r1, r2]! ; pred p4
+ str r0, [r3], lr
+ ldrb r11, [r1, #1] ; pred p5
+
+ add r12, r12, r7, asr #3 ; p4 + o4
+ add r11, r11, r10, asr #3 ; p5 + o5
+
+ usat r12, #8, r12 ; d4 = clip8(p4 + o4)
+ usat r11, #8, r11 ; d5 = clip8(p5 + o5)
+
+ ldrb r7, [r1, #2] ; pred p6
+ ldrb r10, [r1, #3] ; pred p7
+
+ add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
+
+ add r7, r7, r8, asr #3 ; p6 + o6
+ add r10, r10, r6, asr #3 ; p7 + o7
+
+ ldr r0, [sp] ; load input pointer
+
+ usat r7, #8, r7 ; d6 = clip8(p6 + o6)
+ usat r10, #8, r10 ; d7 = clip8(p7 + o7)
+
+ add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
+ add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
+
+ str r12, [r3], lr
+ add r0, r0, #16
+ add r1, r1, r2 ; pred + pitch
+
+ bcs loop2_dual
+
+ add sp, sp, #4 ; pop input pointer slot
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+ END
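
The constants packed into r4/r5 above are the usual VP8 LLM IDCT values in Q16: 0x8A8C = 35468, approximately sqrt(2)*sin(pi/8), and 0x4E7B = 20091, approximately sqrt(2)*cos(pi/8) - 1. One column of the reference butterfly that the dual 16-bit code computes two columns at a time, as a C sketch (names illustrative):

    #include <stdint.h>

    enum { sinpi8sqrt2 = 35468, cospi8sqrt2minus1 = 20091 };  /* Q16 */

    /* Column pass of the 4x4 LLM inverse DCT for column 'col' (0..3);
     * x + ((x * cospi8sqrt2minus1) >> 16) approximates x*sqrt(2)*cos(pi/8). */
    static void idct_col(const int16_t ip[16], int col, int32_t out[4])
    {
        int32_t a1 = ip[col] + ip[col + 8];
        int32_t b1 = ip[col] - ip[col + 8];
        int32_t t1 = (ip[col + 4] * sinpi8sqrt2) >> 16;
        int32_t t2 = ip[col + 12] + ((ip[col + 12] * cospi8sqrt2minus1) >> 16);
        int32_t c1 = t1 - t2;
        int32_t t3 = ip[col + 4] + ((ip[col + 4] * cospi8sqrt2minus1) >> 16);
        int32_t t4 = (ip[col + 12] * sinpi8sqrt2) >> 16;
        int32_t d1 = t3 + t4;

        out[0] = a1 + d1;
        out[1] = b1 + c1;
        out[2] = b1 - c1;
        out[3] = a1 - d1;
    }
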
diff --git a/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm
new file mode 100644
index 0000000..c5ec824
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/intra4x4_predict_v6.asm
@@ -0,0 +1,611 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_intra4x4_predict_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+
+;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
+; B_PREDICTION_MODE left_stride, int b_mode,
+; unsigned char *dst, int dst_stride,
+; unsigned char top_left)
+
+; r0: *Above
+; r1: *yleft
+; r2: left_stride
+; r3: b_mode
+; sp + #40: dst
+; sp + #44: dst_stride
+; sp + #48: top_left
+|vp8_intra4x4_predict_armv6| PROC
+ push {r4-r12, lr}
+
+ cmp r3, #10
+ addlt pc, pc, r3, lsl #2 ; position independent switch
+ pop {r4-r12, pc} ; default
+ b b_dc_pred
+ b b_tm_pred
+ b b_ve_pred
+ b b_he_pred
+ b b_ld_pred
+ b b_rd_pred
+ b b_vr_pred
+ b b_vl_pred
+ b b_hd_pred
+ b b_hu_pred
+
+b_dc_pred
+ ; load values
+ ldr r8, [r0] ; Above
+ ldrb r4, [r1], r2 ; Left[0]
+ mov r9, #0
+ ldrb r5, [r1], r2 ; Left[1]
+ ldrb r6, [r1], r2 ; Left[2]
+ usad8 r12, r8, r9
+ ldrb r7, [r1] ; Left[3]
+
+ ; calculate dc
+ add r4, r4, r5
+ add r4, r4, r6
+ add r4, r4, r7
+ add r4, r4, r12
+ add r4, r4, #4
+ ldr r0, [sp, #44] ; dst_stride
+ mov r12, r4, asr #3 ; (expected_dc + 4) >> 3
+
+ add r12, r12, r12, lsl #8
+ ldr r3, [sp, #40] ; dst
+ add r12, r12, r12, lsl #16
+
+ ; store values
+ str r12, [r3], r0
+ str r12, [r3], r0
+ str r12, [r3], r0
+ str r12, [r3]
+
+ pop {r4-r12, pc}
+
+b_tm_pred
+ ldr r8, [r0] ; Above
+ ldrb r9, [sp, #48] ; top_left
+ ldrb r4, [r1], r2 ; Left[0]
+ ldrb r5, [r1], r2 ; Left[1]
+ ldrb r6, [r1], r2 ; Left[2]
+ ldrb r7, [r1] ; Left[3]
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ add r9, r9, r9, lsl #16 ; [tl|tl]
+ uxtb16 r10, r8 ; a[2|0]
+ uxtb16 r11, r8, ror #8 ; a[3|1]
+ ssub16 r10, r10, r9 ; a[2|0] - [tl|tl]
+ ssub16 r11, r11, r9 ; a[3|1] - [tl|tl]
+
+ add r4, r4, r4, lsl #16 ; l[0|0]
+ add r5, r5, r5, lsl #16 ; l[1|1]
+ add r6, r6, r6, lsl #16 ; l[2|2]
+ add r7, r7, r7, lsl #16 ; l[3|3]
+
+ sadd16 r1, r4, r10 ; l[0|0] + a[2|0] - [tl|tl]
+ sadd16 r2, r4, r11 ; l[0|0] + a[3|1] - [tl|tl]
+ usat16 r1, #8, r1
+ usat16 r2, #8, r2
+
+ sadd16 r4, r5, r10 ; l[1|1] + a[2|0] - [tl|tl]
+ sadd16 r5, r5, r11 ; l[1|1] + a[3|1] - [tl|tl]
+
+ add r12, r1, r2, lsl #8 ; [3|2|1|0]
+ str r12, [r3], r0
+
+ usat16 r4, #8, r4
+ usat16 r5, #8, r5
+
+ sadd16 r1, r6, r10 ; l[2|2] + a[2|0] - [tl|tl]
+ sadd16 r2, r6, r11 ; l[2|2] + a[3|1] - [tl|tl]
+
+ add r12, r4, r5, lsl #8 ; [3|2|1|0]
+ str r12, [r3], r0
+
+ usat16 r1, #8, r1
+ usat16 r2, #8, r2
+
+ sadd16 r4, r7, r10 ; l[3|3] + a[2|0] - [tl|tl]
+ sadd16 r5, r7, r11 ; l[3|3] + a[3|1] - [tl|tl]
+
+ add r12, r1, r2, lsl #8 ; [3|2|1|0]
+
+ usat16 r4, #8, r4
+ usat16 r5, #8, r5
+
+ str r12, [r3], r0
+
+ add r12, r4, r5, lsl #8 ; [3|2|1|0]
+ str r12, [r3]
+
+ pop {r4-r12, pc}
+
+b_ve_pred
+ ldr r8, [r0] ; a[3|2|1|0]
+ ldr r11, c00FF00FF
+ ldrb r9, [sp, #48] ; top_left
+ ldrb r10, [r0, #4] ; a[4]
+
+ ldr r0, c00020002
+
+ uxtb16 r4, r8 ; a[2|0]
+ uxtb16 r5, r8, ror #8 ; a[3|1]
+ ldr r2, [sp, #44] ; dst_stride
+ pkhbt r9, r9, r5, lsl #16 ; a[1|-1]
+
+ add r9, r9, r4, lsl #1 ;[a[1]+2*a[2] | tl+2*a[0] ]
+ uxtab16 r9, r9, r5 ;[a[1]+2*a[2]+a[3] | tl+2*a[0]+a[1] ]
+ ldr r3, [sp, #40] ; dst
+ uxtab16 r9, r9, r0 ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
+
+ add r0, r0, r10, lsl #16 ;[a[4]+2 | 2]
+ add r0, r0, r4, asr #16 ;[a[4]+2 | a[2]+2]
+ add r0, r0, r5, lsl #1 ;[a[4]+2*a[3]+2 | a[2]+2*a[1]+2]
+ uadd16 r4, r4, r0 ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2]
+
+ and r9, r11, r9, asr #2
+ and r4, r11, r4, asr #2
+ add r9, r9, r4, lsl #8
+
+ ; store values
+ str r9, [r3], r2
+ str r9, [r3], r2
+ str r9, [r3], r2
+ str r9, [r3]
+
+ pop {r4-r12, pc}
+
+
+b_he_pred
+ ldrb r4, [r1], r2 ; Left[0]
+ ldrb r8, [sp, #48] ; top_left
+ ldrb r5, [r1], r2 ; Left[1]
+ ldrb r6, [r1], r2 ; Left[2]
+ ldrb r7, [r1] ; Left[3]
+
+ add r8, r8, r4 ; tl + l[0]
+ add r9, r4, r5 ; l[0] + l[1]
+ add r10, r5, r6 ; l[1] + l[2]
+ add r11, r6, r7 ; l[2] + l[3]
+
+ mov r0, #2<<14
+
+ add r8, r8, r9 ; tl + 2*l[0] + l[1]
+ add r4, r9, r10 ; l[0] + 2*l[1] + l[2]
+ add r5, r10, r11 ; l[1] + 2*l[2] + l[3]
+ add r6, r11, r7, lsl #1 ; l[2] + 2*l[3] + l[3]
+
+
+ add r8, r0, r8, lsl #14 ; (tl + 2*l[0] + l[1])>>2 in top half
+ add r9, r0, r4, lsl #14 ; (l[0] + 2*l[1] + l[2])>>2 in top half
+ add r10,r0, r5, lsl #14 ; (l[1] + 2*l[2] + l[3])>>2 in top half
+ add r11,r0, r6, lsl #14 ; (l[2] + 2*l[3] + l[3])>>2 in top half
+
+ pkhtb r8, r8, r8, asr #16 ; l[-|0|-|0]
+ pkhtb r9, r9, r9, asr #16 ; l[-|1|-|1]
+ pkhtb r10, r10, r10, asr #16 ; l[-|2|-|2]
+ pkhtb r11, r11, r11, asr #16 ; l[-|3|-|3]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ add r8, r8, r8, lsl #8 ; l[0|0|0|0]
+ add r9, r9, r9, lsl #8 ; l[1|1|1|1]
+ add r10, r10, r10, lsl #8 ; l[2|2|2|2]
+ add r11, r11, r11, lsl #8 ; l[3|3|3|3]
+
+ ; store values
+ str r8, [r3], r0
+ str r9, [r3], r0
+ str r10, [r3], r0
+ str r11, [r3]
+
+ pop {r4-r12, pc}
+
+b_ld_pred
+ ldr r4, [r0] ; Above[0-3]
+ ldr r12, c00020002
+ ldr r5, [r0, #4] ; Above[4-7]
+ ldr lr, c00FF00FF
+
+ uxtb16 r6, r4 ; a[2|0]
+ uxtb16 r7, r4, ror #8 ; a[3|1]
+ uxtb16 r8, r5 ; a[6|4]
+ uxtb16 r9, r5, ror #8 ; a[7|5]
+ pkhtb r10, r6, r8 ; a[2|4]
+ pkhtb r11, r7, r9 ; a[3|5]
+
+ add r4, r6, r7, lsl #1 ; [a2+2*a3 | a0+2*a1]
+ add r4, r4, r10, ror #16 ; [a2+2*a3+a4 | a0+2*a1+a2]
+ uxtab16 r4, r4, r12 ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
+
+ add r5, r7, r10, ror #15 ; [a3+2*a4 | a1+2*a2]
+ add r5, r5, r11, ror #16 ; [a3+2*a4+a5 | a1+2*a2+a3]
+ uxtab16 r5, r5, r12 ; [a3+2*a4+a5+2 | a1+2*a2+a3+2]
+
+ pkhtb r7, r9, r8, asr #16
+ add r6, r8, r9, lsl #1 ; [a6+2*a7 | a4+2*a5]
+ uadd16 r6, r6, r7 ; [a6+2*a7+a7 | a4+2*a5+a6]
+ uxtab16 r6, r6, r12 ; [a6+2*a7+a7+2 | a4+2*a5+a6+2]
+
+ uxth r7, r9 ; [ a5]
+ add r7, r7, r8, asr #15 ; [ a5+2*a6]
+ add r7, r7, r9, asr #16 ; [ a5+2*a6+a7]
+ uxtah r7, r7, r12 ; [ a5+2*a6+a7+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r4, lr, r4, asr #2
+ and r5, lr, r5, asr #2
+ and r6, lr, r6, asr #2
+ mov r7, r7, asr #2
+
+ add r8, r4, r5, lsl #8 ; [3|2|1|0]
+ str r8, [r3], r0
+
+ mov r9, r8, lsr #8
+ add r9, r9, r6, lsl #24 ; [4|3|2|1]
+ str r9, [r3], r0
+
+ mov r10, r9, lsr #8
+ add r10, r10, r7, lsl #24 ; [5|4|3|2]
+ str r10, [r3], r0
+
+ mov r6, r6, lsr #16
+ mov r11, r10, lsr #8
+ add r11, r11, r6, lsl #24 ; [6|5|4|3]
+ str r11, [r3]
+
+ pop {r4-r12, pc}
+
+b_rd_pred
+ ldrb r7, [r1], r2 ; l[0] = pp[3]
+ ldr lr, [r0] ; Above = pp[8|7|6|5]
+ ldrb r8, [sp, #48] ; tl = pp[4]
+ ldrb r6, [r1], r2 ; l[1] = pp[2]
+ ldrb r5, [r1], r2 ; l[2] = pp[1]
+ ldrb r4, [r1], r2 ; l[3] = pp[0]
+
+
+ uxtb16 r9, lr ; p[7|5]
+ uxtb16 r10, lr, ror #8 ; p[8|6]
+ add r4, r4, r6, lsl #16 ; p[2|0]
+ add r5, r5, r7, lsl #16 ; p[3|1]
+ add r6, r6, r8, lsl #16 ; p[4|2]
+ pkhbt r7, r7, r9, lsl #16 ; p[5|3]
+ pkhbt r8, r8, r10, lsl #16 ; p[6|4]
+
+ ldr r12, c00020002
+ ldr lr, c00FF00FF
+
+ add r4, r4, r5, lsl #1 ; [p2+2*p3 | p0+2*p1]
+ add r4, r4, r6 ; [p2+2*p3+p4 | p0+2*p1+p2]
+ uxtab16 r4, r4, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
+
+ add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2]
+ add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3]
+ uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+ add r6, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4]
+ add r6, r6, r9 ; [p5+2*p6+p7 | p3+2*p4+p5]
+ uxtab16 r6, r6, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+ add r7, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5]
+ add r7, r7, r10 ; [p6+2*p7+p8 | p4+2*p5+p6]
+ uxtab16 r7, r7, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r7, lr, r7, asr #2
+ and r6, lr, r6, asr #2
+ and r5, lr, r5, asr #2
+ and r4, lr, r4, asr #2
+
+ add r8, r6, r7, lsl #8 ; [6|5|4|3]
+ str r8, [r3], r0
+
+ mov r9, r8, lsl #8 ; [5|4|3|-]
+ uxtab r9, r9, r4, ror #16 ; [5|4|3|2]
+ str r9, [r3], r0
+
+ mov r10, r9, lsl #8 ; [4|3|2|-]
+ uxtab r10, r10, r5 ; [4|3|2|1]
+ str r10, [r3], r0
+
+ mov r11, r10, lsl #8 ; [3|2|1|-]
+ uxtab r11, r11, r4 ; [3|2|1|0]
+ str r11, [r3]
+
+ pop {r4-r12, pc}
+
+b_vr_pred
+ ldrb r7, [r1], r2 ; l[0] = pp[3]
+ ldr lr, [r0] ; Above = pp[8|7|6|5]
+ ldrb r8, [sp, #48] ; tl = pp[4]
+ ldrb r6, [r1], r2 ; l[1] = pp[2]
+ ldrb r5, [r1], r2 ; l[2] = pp[1]
+ ldrb r4, [r1] ; l[3] = pp[0]
+
+ add r5, r5, r7, lsl #16 ; p[3|1]
+ add r6, r6, r8, lsl #16 ; p[4|2]
+ uxtb16 r9, lr ; p[7|5]
+ uxtb16 r10, lr, ror #8 ; p[8|6]
+ pkhbt r7, r7, r9, lsl #16 ; p[5|3]
+ pkhbt r8, r8, r10, lsl #16 ; p[6|4]
+
+ ldr r4, c00010001
+ ldr r12, c00020002
+ ldr lr, c00FF00FF
+
+ add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2]
+ add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3]
+ uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+ add r6, r6, r7, lsl #1 ; [p4+2*p5 | p2+2*p3]
+ add r6, r6, r8 ; [p4+2*p5+p6 | p2+2*p3+p4]
+ uxtab16 r6, r6, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
+
+ uadd16 r11, r8, r9 ; [p6+p7 | p4+p5]
+ uhadd16 r11, r11, r4 ; [(p6+p7+1)>>1 | (p4+p5+1)>>1]
+ ; [F|E]
+
+ add r7, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4]
+ add r7, r7, r9 ; [p5+2*p6+p7 | p3+2*p4+p5]
+ uxtab16 r7, r7, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+ uadd16 r2, r9, r10 ; [p7+p8 | p5+p6]
+ uhadd16 r2, r2, r4 ; [(p7+p8+1)>>1 | (p5+p6+1)>>1]
+ ; [J|I]
+
+ add r8, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5]
+ add r8, r8, r10 ; [p6+2*p7+p8 | p4+2*p5+p6]
+ uxtab16 r8, r8, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r5, lr, r5, asr #2 ; [B|A]
+ and r6, lr, r6, asr #2 ; [D|C]
+ and r7, lr, r7, asr #2 ; [H|G]
+ and r8, lr, r8, asr #2 ; [L|K]
+
+ add r12, r11, r2, lsl #8 ; [J|F|I|E]
+ str r12, [r3], r0
+
+ add r12, r7, r8, lsl #8 ; [L|H|K|G]
+ str r12, [r3], r0
+
+ pkhbt r2, r6, r2, lsl #16 ; [-|I|-|C]
+ add r2, r2, r11, lsl #8 ; [F|I|E|C]
+
+ pkhtb r12, r6, r5 ; [-|D|-|A]
+ pkhtb r10, r7, r5, asr #16 ; [-|H|-|B]
+ str r2, [r3], r0
+ add r12, r12, r10, lsl #8 ; [H|D|B|A]
+ str r12, [r3]
+
+ pop {r4-r12, pc}
+
+b_vl_pred
+ ldr r4, [r0] ; [3|2|1|0] = Above[0-3]
+ ldr r12, c00020002
+ ldr r5, [r0, #4] ; [7|6|5|4] = Above[4-7]
+ ldr lr, c00FF00FF
+ ldr r2, c00010001
+
+ mov r0, r4, lsr #16 ; [-|-|3|2]
+ add r0, r0, r5, lsl #16 ; [5|4|3|2]
+ uxtb16 r6, r4 ; [2|0]
+ uxtb16 r7, r4, ror #8 ; [3|1]
+ uxtb16 r8, r0 ; [4|2]
+ uxtb16 r9, r0, ror #8 ; [5|3]
+ uxtb16 r10, r5 ; [6|4]
+ uxtb16 r11, r5, ror #8 ; [7|5]
+
+ uadd16 r4, r6, r7 ; [p2+p3 | p0+p1]
+ uhadd16 r4, r4, r2 ; [(p2+p3+1)>>1 | (p0+p1+1)>>1]
+ ; [B|A]
+
+ add r5, r6, r7, lsl #1 ; [p2+2*p3 | p0+2*p1]
+ add r5, r5, r8 ; [p2+2*p3+p4 | p0+2*p1+p2]
+ uxtab16 r5, r5, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
+
+ uadd16 r6, r7, r8 ; [p3+p4 | p1+p2]
+ uhadd16 r6, r6, r2 ; [(p3+p4+1)>>1 | (p1+p2+1)>>1]
+ ; [F|E]
+
+ add r7, r7, r8, lsl #1 ; [p3+2*p4 | p1+2*p2]
+ add r7, r7, r9 ; [p3+2*p4+p5 | p1+2*p2+p3]
+ uxtab16 r7, r7, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
+
+ add r8, r8, r9, lsl #1 ; [p4+2*p5 | p2+2*p3]
+ add r8, r8, r10 ; [p4+2*p5+p6 | p2+2*p3+p4]
+ uxtab16 r8, r8, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
+
+ add r9, r9, r10, lsl #1 ; [p5+2*p6 | p3+2*p4]
+ add r9, r9, r11 ; [p5+2*p6+p7 | p3+2*p4+p5]
+ uxtab16 r9, r9, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
+
+ ldr r0, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ ; scale down
+ and r5, lr, r5, asr #2 ; [D|C]
+ and r7, lr, r7, asr #2 ; [H|G]
+ and r8, lr, r8, asr #2 ; [I|D]
+ and r9, lr, r9, asr #2 ; [J|H]
+
+ add r10, r4, r6, lsl #8 ; [F|B|E|A]
+ str r10, [r3], r0
+
+ add r5, r5, r7, lsl #8 ; [H|C|G|D]
+ str r5, [r3], r0
+
+ pkhtb r12, r8, r4, asr #16 ; [-|I|-|B]
+ pkhtb r10, r9, r8 ; [-|J|-|D]
+
+ add r12, r6, r12, lsl #8 ; [I|F|B|E]
+ str r12, [r3], r0
+
+ add r10, r7, r10, lsl #8 ; [J|H|D|G]
+ str r10, [r3]
+
+ pop {r4-r12, pc}
+
+b_hd_pred
+ ldrb r7, [r1], r2 ; l[0] = pp[3]
+ ldr lr, [r0] ; Above = pp[8|7|6|5]
+ ldrb r8, [sp, #48] ; tl = pp[4]
+ ldrb r6, [r1], r2 ; l[1] = pp[2]
+ ldrb r5, [r1], r2 ; l[2] = pp[1]
+ ldrb r4, [r1] ; l[3] = pp[0]
+
+ uxtb16 r9, lr ; p[7|5]
+ uxtb16 r10, lr, ror #8 ; p[8|6]
+
+ add r4, r4, r5, lsl #16 ; p[1|0]
+ add r5, r5, r6, lsl #16 ; p[2|1]
+ add r6, r6, r7, lsl #16 ; p[3|2]
+ add r7, r7, r8, lsl #16 ; p[4|3]
+
+ ldr r12, c00020002
+ ldr lr, c00FF00FF
+ ldr r2, c00010001
+
+ pkhtb r8, r7, r9 ; p[4|5]
+ pkhtb r1, r9, r10 ; p[7|6]
+ pkhbt r10, r8, r10, lsl #16 ; p[6|5]
+
+ uadd16 r11, r4, r5 ; [p1+p2 | p0+p1]
+ uhadd16 r11, r11, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
+ ; [B|A]
+
+ add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1]
+ add r4, r4, r6 ; [p1+2*p2+p3 | p0+2*p1+p2]
+ uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
+
+ uadd16 r0, r6, r7 ; [p3+p4 | p2+p3]
+ uhadd16 r0, r0, r2 ; [(p3+p4+1)>>1 | (p2+p3+1)>>1]
+ ; [F|E]
+
+ add r5, r6, r7, lsl #1 ; [p3+2*p4 | p2+2*p3]
+ add r5, r5, r8, ror #16 ; [p3+2*p4+p5 | p2+2*p3+p4]
+ uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p2+2*p3+p4+2]
+
+ add r6, r12, r8, ror #16 ; [p5+2 | p4+2]
+ add r6, r6, r10, lsl #1 ; [p5+2+2*p6 | p4+2+2*p5]
+ uxtab16 r6, r6, r1 ; [p5+2+2*p6+p7 | p4+2+2*p5+p6]
+
+ ; scale down
+ and r4, lr, r4, asr #2 ; [D|C]
+ and r5, lr, r5, asr #2 ; [H|G]
+ and r6, lr, r6, asr #2 ; [J|I]
+
+ ldr lr, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+
+ pkhtb r2, r0, r6 ; [-|F|-|I]
+ pkhtb r12, r6, r5, asr #16 ; [-|J|-|H]
+ add r12, r12, r2, lsl #8 ; [F|J|I|H]
+ add r2, r0, r5, lsl #8 ; [H|F|G|E]
+ mov r12, r12, ror #24 ; [J|I|H|F]
+ str r12, [r3], lr
+
+ mov r7, r11, asr #16 ; [-|-|-|B]
+ str r2, [r3], lr
+ add r7, r7, r0, lsl #16 ; [-|E|-|B]
+ add r7, r7, r4, asr #8 ; [-|E|D|B]
+ add r7, r7, r5, lsl #24 ; [G|E|D|B]
+ str r7, [r3], lr
+
+ add r5, r11, r4, lsl #8 ; [D|B|C|A]
+ str r5, [r3]
+
+ pop {r4-r12, pc}
+
+
+
+b_hu_pred
+ ldrb r4, [r1], r2 ; Left[0]
+ ldr r12, c00020002
+ ldrb r5, [r1], r2 ; Left[1]
+ ldr lr, c00FF00FF
+ ldrb r6, [r1], r2 ; Left[2]
+ ldr r2, c00010001
+ ldrb r7, [r1] ; Left[3]
+
+ add r4, r4, r5, lsl #16 ; [1|0]
+ add r5, r5, r6, lsl #16 ; [2|1]
+ add r9, r6, r7, lsl #16 ; [3|2]
+
+ uadd16 r8, r4, r5 ; [p1+p2 | p0+p1]
+ uhadd16 r8, r8, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
+ ; [B|A]
+
+ add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1]
+ add r4, r4, r9 ; [p1+2*p2+p3 | p0+2*p1+p2]
+ uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
+ ldr r2, [sp, #44] ; dst_stride
+ ldr r3, [sp, #40] ; dst
+ and r4, lr, r4, asr #2 ; [D|C]
+
+ add r10, r6, r7 ; [p2+p3]
+ add r11, r10, r7, lsl #1 ; [p2+3*p3]
+ add r10, r10, #1
+ add r11, r11, #2
+ mov r10, r10, asr #1 ; [E]
+ mov r11, r11, asr #2 ; [F]
+
+ add r9, r7, r9, asr #8 ; [-|-|G|G]
+ add r0, r8, r4, lsl #8 ; [D|B|C|A]
+ add r7, r9, r9, lsl #16 ; [G|G|G|G]
+
+ str r0, [r3], r2
+
+ mov r1, r8, asr #16 ; [-|-|-|B]
+ add r1, r1, r4, asr #8 ; [-|-|D|B]
+ add r1, r1, r10, lsl #16 ; [-|E|D|B]
+ add r1, r1, r11, lsl #24 ; [F|E|D|B]
+ str r1, [r3], r2
+
+ add r10, r10, r11, lsl #8 ; [-|-|F|E]
+ add r10, r10, r9, lsl #16 ; [G|G|F|E]
+ str r10, [r3], r2
+
+ str r7, [r3]
+
+ pop {r4-r12, pc}
+
+ ENDP
+
+; constants
+c00010001
+ DCD 0x00010001
+c00020002
+ DCD 0x00020002
+c00FF00FF
+ DCD 0x00FF00FF
+
+ END
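
As one example of what the ten mode handlers above compute, B_DC_PRED averages the four Above and four Left pixels with rounding and fills the 4x4 block with the result, exactly as b_dc_pred does with usad8 and the (sum + 4) >> 3 shift. A scalar C sketch; names are illustrative:

    #include <stdint.h>

    static void dc_pred_4x4(const uint8_t *above, const uint8_t *left,
                            int left_stride, uint8_t *dst, int dst_stride)
    {
        int i, j, sum = 4;                /* rounding bias, as in 'add r4, r4, #4' */

        for (i = 0; i < 4; i++)
            sum += above[i] + left[i * left_stride];

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                dst[i * dst_stride + j] = (uint8_t)(sum >> 3);
    }
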
diff --git a/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm b/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm
new file mode 100644
index 0000000..31ef09c
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/iwalsh_v6.asm
@@ -0,0 +1,136 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_inv_walsh4x4_v6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;void vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
+|vp8_short_inv_walsh4x4_v6| PROC
+
+ stmdb sp!, {r4 - r12, lr}
+
+ ldr r2, [r0, #0] ; [1 | 0]
+ ldr r3, [r0, #4] ; [3 | 2]
+ ldr r4, [r0, #8] ; [5 | 4]
+ ldr r5, [r0, #12] ; [7 | 6]
+ ldr r6, [r0, #16] ; [9 | 8]
+ ldr r7, [r0, #20] ; [11 | 10]
+ ldr r8, [r0, #24] ; [13 | 12]
+ ldr r9, [r0, #28] ; [15 | 14]
+
+ qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
+ qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
+ qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
+ qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
+
+ qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
+ qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
+ qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
+ qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
+
+ qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
+ qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
+ qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
+ qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
+
+ qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
+ qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
+ qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
+ qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
+
+ ; first transform complete
+
+ qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
+ qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
+ qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
+ qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
+
+ qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
+ ldr r10, c0x00030003
+ qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r2, r2, r10 ; [b2+3|c2+3]
+ qadd16 r3, r3, r10 ; [a2+3|d2+3]
+ qadd16 r4, r4, r10 ; [b2+3|c2+3]
+ qadd16 r5, r5, r10 ; [a2+3|d2+3]
+
+ asr r12, r3, #19 ; [0]
+ strh r12, [r1], #32
+ asr lr, r2, #19 ; [1]
+ strh lr, [r1], #32
+ sxth r2, r2
+ sxth r3, r3
+ asr r2, r2, #3 ; [2]
+ strh r2, [r1], #32
+ asr r3, r3, #3 ; [3]
+ strh r3, [r1], #32
+
+ asr r12, r5, #19 ; [4]
+ strh r12, [r1], #32
+ asr lr, r4, #19 ; [5]
+ strh lr, [r1], #32
+ sxth r4, r4
+ sxth r5, r5
+ asr r4, r4, #3 ; [6]
+ strh r4, [r1], #32
+ asr r5, r5, #3 ; [7]
+ strh r5, [r1], #32
+
+ qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
+ qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
+ qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
+ qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
+
+ qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
+ qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
+ qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
+
+ qadd16 r6, r6, r10 ; [b2+3|c2+3]
+ qadd16 r7, r7, r10 ; [a2+3|d2+3]
+ qadd16 r8, r8, r10 ; [b2+3|c2+3]
+ qadd16 r9, r9, r10 ; [a2+3|d2+3]
+
+ asr r12, r7, #19 ; [8]
+ strh r12, [r1], #32
+ asr lr, r6, #19 ; [9]
+ strh lr, [r1], #32
+ sxth r6, r6
+ sxth r7, r7
+ asr r6, r6, #3 ; [10]
+ strh r6, [r1], #32
+ asr r7, r7, #3 ; [11]
+ strh r7, [r1], #32
+
+ asr r12, r9, #19 ; [12]
+ strh r12, [r1], #32
+ asr lr, r8, #19 ; [13]
+ strh lr, [r1], #32
+ sxth r8, r8
+ sxth r9, r9
+ asr r8, r8, #3 ; [14]
+ strh r8, [r1], #32
+ asr r9, r9, #3 ; [15]
+ strh r9, [r1], #32
+
+ ldmia sp!, {r4 - r12, pc}
+ ENDP ; |vp8_short_inv_walsh4x4_v6|
+
+
+; Constant Pool
+c0x00030003 DCD 0x00030003
+ END
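
The SIMD code above runs the inverse Walsh-Hadamard transform on two packed columns at a time and scatters the results with strh ..., #32, i.e. into every 16th short of mb_dqcoeff. A scalar C sketch of the same transform, column pass then row pass with (x + 3) >> 3 rounding; names are illustrative:

    #include <stdint.h>

    static void inv_walsh4x4(const int16_t *input, int16_t *mb_dqcoeff)
    {
        int16_t tmp[16];
        int i;

        for (i = 0; i < 4; i++)           /* column pass */
        {
            int a1 = input[i] + input[12 + i];
            int b1 = input[4 + i] + input[8 + i];
            int c1 = input[4 + i] - input[8 + i];
            int d1 = input[i] - input[12 + i];
            tmp[i]      = (int16_t)(a1 + b1);
            tmp[4 + i]  = (int16_t)(c1 + d1);
            tmp[8 + i]  = (int16_t)(a1 - b1);
            tmp[12 + i] = (int16_t)(d1 - c1);
        }
        for (i = 0; i < 4; i++)           /* row pass with rounding */
        {
            int a1 = tmp[4 * i] + tmp[4 * i + 3];
            int b1 = tmp[4 * i + 1] + tmp[4 * i + 2];
            int c1 = tmp[4 * i + 1] - tmp[4 * i + 2];
            int d1 = tmp[4 * i] - tmp[4 * i + 3];
            mb_dqcoeff[(4 * i + 0) * 16] = (int16_t)((a1 + b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 1) * 16] = (int16_t)((c1 + d1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 2) * 16] = (int16_t)((a1 - b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 3) * 16] = (int16_t)((d1 - c1 + 3) >> 3);
        }
    }
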
diff --git a/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm b/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
new file mode 100644
index 0000000..1cbbbcd
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/loopfilter_v6.asm
@@ -0,0 +1,1282 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_vertical_edge_armv6|
+ EXPORT |vp8_mbloop_filter_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
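
TRANSPOSE_MATRIX treats its four word arguments as the rows of a 4x4 byte matrix and produces the transpose, which lets the vertical-edge filters below reuse the packed per-row arithmetic on columns. A scalar C sketch of the same permutation; in/out mirror $a0-$a3 and $b0-$b3:

    #include <stdint.h>

    static void transpose_4x4_bytes(const uint32_t in[4], uint32_t out[4])
    {
        int r, c;
        for (c = 0; c < 4; c++)
        {
            uint32_t word = 0;
            for (r = 0; r < 4; r++)       /* byte r of out[c] = byte c of in[r] */
                word |= ((in[r] >> (8 * c)) & 0xff) << (8 * r);
            out[c] = word;
        }
    }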
+
+
+src RN r0
+pstep RN r1
+count RN r5
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit,
+;r3 const char *limit,
+;stack const char *thresh,
+;stack int count
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldrb r4, [r2] ; blimit
+ ldr r10, [src], pstep ; p2
+ ldrb r2, [r3] ; limit
+ ldr r11, [src], pstep ; p1
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|Hnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+ orr lr, lr, r10
+ sub src, src, pstep, lsl #2
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq hskip_filter ; skip filtering
+
+ sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ orr r10, r6, r8 ; calculate vp8_hevmask
+
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10 ; use usub8 instead of ssub8
+ sel r6, r12, r11 ; obtain vp8_hevmask: r6
+
+ ;vp8_filter() function
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ and r7, r7, lr ; vp8_filter &= mask;
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: Filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;end of modification for vp8
+
+ mov lr, #0
+ sadd8 r7, r7 , r10 ; vp8_filter += 1
+ shadd8 r7, r7, lr ; vp8_filter >>= 1
+
+ ldr r11, [sp, #12] ; load ps1
+ ldr r10, [sp, #8] ; load qs1
+
+ bic r7, r7, r6 ; vp8_filter &= ~hev
+ sub src, src, pstep, lsl #2
+
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+
+ eor r11, r11, r12 ; *op1 = u^0x80
+ str r11, [src], pstep ; store op1
+ eor r9, r9, r12 ; *op0 = u^0x80
+ str r9, [src], pstep ; store op0 result
+ eor r8, r8, r12 ; *oq0 = u^0x80
+ str r8, [src], pstep ; store oq0 result
+ eor r10, r10, r12 ; *oq1 = u^0x80
+ str r10, [src], pstep ; store oq1
+
+ sub src, src, pstep, lsl #1
+
+|hskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #2
+
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne Hnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_armv6|
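
The uqsub8/orr chains above evaluate, four pixels at a time, the VP8 breakout test: each neighbouring delta must be within 'limit' and the combined edge measure within 'blimit', otherwise the pixel is left unfiltered (sel then produces an all-ones byte lane where filtering applies). A per-pixel C sketch with illustrative names:

    #include <stdint.h>

    static uint8_t absdiff(uint8_t a, uint8_t b) { return a > b ? a - b : b - a; }

    /* Returns nonzero when this pixel position should be filtered. */
    static int filter_mask(uint8_t limit, uint8_t blimit,
                           uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                           uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3)
    {
        int mask = 1;
        mask &= absdiff(p3, p2) <= limit;
        mask &= absdiff(p2, p1) <= limit;
        mask &= absdiff(p1, p0) <= limit;
        mask &= absdiff(q1, q0) <= limit;
        mask &= absdiff(q2, q1) <= limit;
        mask &= absdiff(q3, q2) <= limit;
        mask &= absdiff(p0, q0) * 2 + absdiff(p1, q1) / 2 <= blimit;
        return mask;
    }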
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r6, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r9, [src], pstep ; p3
+ ldrb r4, [r2] ; blimit
+ ldr r10, [src], pstep ; p2
+ ldrb r2, [r3] ; limit
+ ldr r11, [src], pstep ; p1
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r6] ; thresh
+ orr r2, r2, r2, lsl #8
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|MBHnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ldr r12, [src], pstep ; p0
+
+ uqsub8 r6, r9, r10 ; p3 - p2
+ uqsub8 r7, r10, r9 ; p2 - p3
+ uqsub8 r8, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+
+ orr r6, r6, r7 ; abs (p3-p2)
+ orr r8, r8, r10 ; abs (p2-p1)
+ uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r8, r8, r2 ; compare to limit
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 ; p0 - p1
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src], pstep ; q1
+ orr r6, r6, r7 ; abs (p1-p0)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r10 ; p1 - q1
+ uqsub8 r7, r10, r11 ; q1 - p1
+ uqsub8 r11, r12, r9 ; p0 - q0
+ uqsub8 r12, r9, r12 ; q0 - p0
+ orr r6, r6, r7 ; abs (p1-q1)
+ ldr r7, c0x7F7F7F7F
+ orr r12, r11, r12 ; abs (p0-q0)
+ ldr r11, [src], pstep ; q2
+ uqadd8 r12, r12, r12 ; abs (p0-q0) * 2
+ and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r7, r9, r10 ; q0 - q1
+ uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r6, r10, r9 ; q1 - q0
+ uqsub8 r12, r12, r4 ; compare to flimit
+ uqsub8 r9, r11, r10 ; q2 - q1
+
+ orr lr, lr, r12
+
+ ldr r12, [src], pstep ; q3
+
+ uqsub8 r10, r10, r11 ; q1 - q2
+ orr r6, r7, r6 ; abs (q1-q0)
+ orr r10, r9, r10 ; abs (q2-q1)
+ uqsub8 r7, r6, r2 ; compare to limit
+ uqsub8 r10, r10, r2 ; compare to limit
+ uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later
+ orr lr, lr, r7
+ orr lr, lr, r10
+
+ uqsub8 r10, r12, r11 ; q3 - q2
+ uqsub8 r9, r11, r12 ; q2 - q3
+
+ mvn r11, #0 ; r11 == -1
+
+ orr r10, r10, r9 ; abs (q3-q2)
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ mov r12, #0
+
+ orr lr, lr, r10
+
+ usub8 lr, r12, lr ; use usub8 instead of ssub8
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbhskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+ sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines
+ sub src, src, pstep, lsl #1
+
+ orr r10, r6, r8
+ ldr r7, [src], pstep ; p1
+
+ usub8 r10, r12, r10
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_mbfilter() function
+ ;p2, q2 are only needed at the end. Don't need to load them in now.
+ ldr r8, [src], pstep ; p0
+ ldr r12, c0x80808080
+ ldr r9, [src], pstep ; q0
+ ldr r10, [src] ; q1
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ str r8, [src] ; store *oq0
+ sub src, src, pstep
+ eor r10, r10, lr ; *op0 = s^0x80
+ str r10, [src] ; store *op0
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ eor r11, r11, lr ; *op1 = s^0x80
+ str r11, [src], pstep ; store *op1
+ eor r8, r8, lr ; *oq1 = s^0x80
+ add src, src, pstep, lsl #1
+
+ mov r7, #0x3f ; 63
+
+ str r8, [src], pstep ; store *oq1
+
+ ;roughly 1/7th difference across boundary
+ mov lr, #0x9 ; 9
+ ldr r9, [src] ; load q2
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ sub src, src, pstep
+ ldr lr, c0x80808080
+
+ ldr r11, [src] ; load p2
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ str r8, [src], pstep, lsl #2 ; store *op2
+ add src, src, pstep
+ eor r10, r10, lr ; *oq2 = s^0x80
+ str r10, [src], pstep, lsl #1 ; store *oq2
+
+|mbhskip_filter|
+ add src, src, #4
+ sub src, src, pstep, lsl #3
+ subs count, count, #1
+
+ ldrne r9, [src], pstep ; p3
+ ldrne r10, [src], pstep ; p2
+ ldrne r11, [src], pstep ; p1
+
+ bne MBHnext8
+
+ add sp, sp, #16
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6|
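
The three "roughly 3/7th / 2/7th / 1/7th" stages above apply tap weights 27, 18 and 9 to the masked filter value w, each time forming u = clamp((63 + w*tap) >> 7) and moving a p/q pair toward each other; the asm does the same multiply, bias and ssat per 16-bit lane, on values biased into the signed domain by the 0x80 eor. One stage as a C sketch, names illustrative:

    #include <stdint.h>

    static int8_t clamp_s8(int v)
    {
        return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
    }

    /* Apply one tap stage (tap = 27, 18 or 9) to a signed p/q pixel pair. */
    static void mb_filter_stage(int8_t w, int8_t *ps, int8_t *qs, int tap)
    {
        int8_t u = clamp_s8((63 + w * tap) >> 7);
        *qs = clamp_s8(*qs - u);
        *ps = clamp_s8(*ps + u);
    }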
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldrb r4, [r2] ; blimit
+ ldr r7, [src], pstep
+ ldrb r2, [r3] ; limit
+ ldr r8, [src], pstep
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|Vnext8|
+
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs (r6 - r12 and lr), so save register values now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq vskip_filter ; skip filtering
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+ ;vp8_filter() function
+ ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ ; Transpose needs 8 regs (r6 - r12 and lr). Save r6 and lr first
+ str r6, [sp]
+ str lr, [sp, #4]
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to r7, r8, r9, r10
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; p1 offset to convert to a signed value
+ eor r8, r8, r12 ; p0 offset to convert to a signed value
+ eor r9, r9, r12 ; q0 offset to convert to a signed value
+ eor r10, r10, r12 ; q1 offset to convert to a signed value
+
+ str r9, [sp] ; store qs0 temporarily
+ str r8, [sp, #4] ; store ps0 temporarily
+ str r10, [sp, #8] ; store qs1 temporarily
+ str r7, [sp, #12] ; store ps1 temporarily
+
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+
+ and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter)
+
+ qadd8 r7, r7, r8
+ ldr r9, c0x03030303 ; r9 = 3 --modified for vp8
+
+ qadd8 r7, r7, r8
+ ldr r10, c0x04040404
+
+ qadd8 r7, r7, r8
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask
+
+ ;modify code for vp8 -- Filter1 = vp8_filter (r7)
+ qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 r8 , r8 , r9 ; Filter2 >>= 3
+ shadd8 r7 , r7 , r9 ; vp8_filter >>= 3
+ shadd8 r8 , r8 , r9
+ shadd8 r7 , r7 , r9
+ shadd8 lr , r8 , r9 ; lr: filter2
+ shadd8 r7 , r7 , r9 ; r7: filter
+
+ ;usub8 lr, r8, r10 ; s = (s==4)*-1
+ ;sel lr, r11, r9
+ ;usub8 r8, r10, r8
+ ;sel r8, r11, r9
+ ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s
+
+ ;calculate output
+ ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter)
+
+ ldr r8, [sp] ; load qs0
+ ldr r9, [sp, #4] ; load ps0
+
+ ldr r10, c0x01010101
+
+ qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
+ qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ ;end of modification for vp8
+
+ eor r8, r8, r12
+ eor r9, r9, r12
+
+ mov lr, #0
+
+ sadd8 r7, r7, r10
+ shadd8 r7, r7, lr
+
+ ldr r10, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+
+ bic r7, r7, r6 ; r7: vp8_filter
+
+ qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
+ qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ eor r10, r10, r12
+ eor r11, r11, r12
+
+ sub src, src, pstep, lsl #2
+
+ ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
+ ;output is b0, b1, b2, b3
+ ;b0: 03 02 01 00
+ ;b1: 13 12 11 10
+ ;b2: 23 22 21 20
+ ;b3: 33 32 31 30
+ ; p1 p0 q0 q1
+ ; (a3 a2 a1 a0)
+ TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr
+
+ strh r6, [src, #-2] ; store the result
+ mov r6, r6, lsr #16
+ strh r6, [src], pstep
+
+ strh r7, [src, #-2]
+ mov r7, r7, lsr #16
+ strh r7, [src], pstep
+
+ strh r12, [src, #-2]
+ mov r12, r12, lsr #16
+ strh r12, [src], pstep
+
+ strh lr, [src, #-2]
+ mov lr, lr, lsr #16
+ strh lr, [src], pstep
+
+|vskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ ldrne r6, [src], pstep ; load source data
+ ldrne r7, [src], pstep
+ ldrne r8, [src], pstep
+ ldrne lr, [src], pstep
+
+ bne Vnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_armv6|
+
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_mbloop_filter_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ sub src, src, #4 ; move src pointer down by 4
+ ldr count, [sp, #40] ; count for 8-in-parallel
+ ldr r12, [sp, #36] ; load thresh address
+ pld [src, #23] ; preload for next block
+ sub sp, sp, #16 ; create temp buffer
+
+ ldr r6, [src], pstep ; load source data
+ ldrb r4, [r2] ; blimit
+ pld [src, #23]
+ ldr r7, [src], pstep
+ ldrb r2, [r3] ; limit
+ pld [src, #23]
+ ldr r8, [src], pstep
+ orr r4, r4, r4, lsl #8
+ ldrb r3, [r12] ; thresh
+ orr r2, r2, r2, lsl #8
+ pld [src, #23]
+ ldr lr, [src], pstep
+ mov count, count, lsl #1 ; 4-in-parallel
+ orr r4, r4, r4, lsl #16
+ orr r3, r3, r3, lsl #8
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+
+|MBVnext8|
+ ; vp8_filter_mask() function
+ ; calculate breakout conditions
+ ; transpose the source data for 4-in-parallel operation
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ uqsub8 r7, r9, r10 ; p3 - p2
+ uqsub8 r8, r10, r9 ; p2 - p3
+ uqsub8 r9, r10, r11 ; p2 - p1
+ uqsub8 r10, r11, r10 ; p1 - p2
+ orr r7, r7, r8 ; abs (p3-p2)
+ orr r10, r9, r10 ; abs (p2-p1)
+ uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask
+ uqsub8 r10, r10, r2 ; compare to limit
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr lr, lr, r10
+
+ uqsub8 r6, r11, r12 ; p1 - p0
+ uqsub8 r7, r12, r11 ; p0 - p1
+ add src, src, #4 ; move src pointer up by 4
+ orr r6, r6, r7 ; abs (p1-p0)
+ str r11, [sp, #12] ; save p1
+ uqsub8 r10, r6, r2 ; compare to limit
+ uqsub8 r11, r6, r3 ; compare to thresh
+ orr lr, lr, r10
+
+ ; transpose uses 8 regs (r6 - r12 and lr), so save register values now
+ ; transpose the source data for 4-in-parallel operation
+ ldr r6, [src], pstep ; load source data
+ str r11, [sp] ; push r11 to stack
+ ldr r7, [src], pstep
+ str r12, [sp, #4] ; save current reg before load q0 - q3 data
+ ldr r8, [src], pstep
+ str lr, [sp, #8]
+ ldr lr, [src], pstep
+
+
+ TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
+
+ ldr lr, [sp, #8] ; load back (f)limit accumulator
+
+ uqsub8 r6, r12, r11 ; q3 - q2
+ uqsub8 r7, r11, r12 ; q2 - q3
+ uqsub8 r12, r11, r10 ; q2 - q1
+ uqsub8 r11, r10, r11 ; q1 - q2
+ orr r6, r6, r7 ; abs (q3-q2)
+ orr r7, r12, r11 ; abs (q2-q1)
+ uqsub8 r6, r6, r2 ; compare to limit
+ uqsub8 r7, r7, r2 ; compare to limit
+ ldr r11, [sp, #4] ; load back p0
+ ldr r12, [sp, #12] ; load back p1
+ orr lr, lr, r6
+ orr lr, lr, r7
+
+ uqsub8 r6, r11, r9 ; p0 - q0
+ uqsub8 r7, r9, r11 ; q0 - p0
+ uqsub8 r8, r12, r10 ; p1 - q1
+ uqsub8 r11, r10, r12 ; q1 - p1
+ orr r6, r6, r7 ; abs (p0-q0)
+ ldr r7, c0x7F7F7F7F
+ orr r8, r8, r11 ; abs (p1-q1)
+ uqadd8 r6, r6, r6 ; abs (p0-q0) * 2
+ and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2
+ uqsub8 r11, r10, r9 ; q1 - q0
+ uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2
+ uqsub8 r12, r9, r10 ; q0 - q1
+ uqsub8 r6, r6, r4 ; compare to flimit
+
+ orr r9, r11, r12 ; abs (q1-q0)
+ uqsub8 r8, r9, r2 ; compare to limit
+ uqsub8 r10, r9, r3 ; compare to thresh
+ orr lr, lr, r6
+ orr lr, lr, r8
+
+ mvn r11, #0 ; r11 == -1
+ mov r12, #0
+
+ usub8 lr, r12, lr
+ ldr r9, [sp] ; load the compared result
+ sel lr, r11, r12 ; filter mask: lr
+
+ cmp lr, #0
+ beq mbvskip_filter ; skip filtering
+
+
+
+ ;vp8_hevmask() function
+ ;calculate high edge variance
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r9, r9, r10
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ usub8 r9, r12, r9
+ sel r6, r12, r11 ; hev mask: r6
+
+
+ ; vp8_mbfilter() function
+    ; p2 and q2 are only needed at the end, so there is no need to load them yet.
+    ; The transpose needs 8 regs (r6 - r12 and lr). Save r6 and lr first.
+    ; load source data to r6, r11, r12, lr
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ pkhbt r12, r7, r8, lsl #16
+
+ ldrh r7, [src, #-2]
+ ldrh r8, [src], pstep
+
+ pkhbt r11, r9, r10, lsl #16
+
+ ldrh r9, [src, #-2]
+ ldrh r10, [src], pstep
+
+ str r6, [sp] ; save r6
+ str lr, [sp, #4] ; save lr
+
+ pkhbt r6, r7, r8, lsl #16
+ pkhbt lr, r9, r10, lsl #16
+
+ ;transpose r12, r11, r6, lr to p1, p0, q0, q1
+ TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
+
+ ;load back hev_mask r6 and filter_mask lr
+ ldr r12, c0x80808080
+ ldr r6, [sp]
+ ldr lr, [sp, #4]
+
+ eor r7, r7, r12 ; ps1
+ eor r8, r8, r12 ; ps0
+ eor r9, r9, r12 ; qs0
+ eor r10, r10, r12 ; qs1
+
+ qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ str r7, [sp, #12] ; store ps1 temporarily
+ qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1)
+ str r10, [sp, #8] ; store qs1 temporarily
+ qadd8 r7, r7, r12
+ str r9, [sp] ; store qs0 temporarily
+ qadd8 r7, r7, r12
+ str r8, [sp, #4] ; store ps0 temporarily
+ qadd8 r7, r7, r12 ; vp8_filter: r7
+
+ ldr r10, c0x03030303 ; r10 = 3 --modified for vp8
+ ldr r9, c0x04040404
+ ;mvn r11, #0 ; r11 == -1
+
+ and r7, r7, lr ; vp8_filter &= mask (lr is free)
+
+ mov r12, r7 ; Filter2: r12
+ and r12, r12, r6 ; Filter2 &= hev
+
+ ;modify code for vp8
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
+ qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)
+
+ mov r10, #0
+ shadd8 r8 , r8 , r10 ; Filter1 >>= 3
+ shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ shadd8 r8 , r8 , r10
+ shadd8 r12 , r12 , r10
+ shadd8 r8 , r8 , r10 ; r8: Filter1
+ shadd8 r12 , r12 , r10 ; r12: Filter2
+
+ ldr r9, [sp] ; load qs0
+ ldr r11, [sp, #4] ; load ps0
+
+ qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
+ qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)
+
+ ;save bottom 3 bits so that we round one side +4 and the other +3
+ ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8)
+ ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4)
+ ;mov r10, #0
+ ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3
+ ;usub8 lr, r8, r9 ; s = (s==4)*-1
+ ;sel lr, r11, r10
+ ;shadd8 r12 , r12 , r10
+ ;usub8 r8, r9, r8
+ ;sel r8, r11, r10
+ ;ldr r9, [sp] ; load qs0
+ ;ldr r11, [sp, #4] ; load ps0
+ ;shadd8 r12 , r12 , r10
+ ;and r8, r8, lr ; -1 for each element that equals 4
+ ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2)
+ ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
+ ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u)
+
+ ;end of modification for vp8
+
+ bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free)
+ ;mov r12, r7
+
+ ;roughly 3/7th difference across boundary
+ mov lr, #0x1b ; 27
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ ldr lr, c0x80808080
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr ; *oq0 = s^0x80
+ eor r10, r10, lr ; *op0 = s^0x80
+
+ strb r10, [src, #-1] ; store op0 result
+ strb r8, [src], pstep ; store oq0 result
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ strb r10, [src, #-1]
+ strb r8, [src], pstep
+
+ ;roughly 2/7th difference across boundary
+ mov lr, #0x12 ; 18
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r9, r10, lr, r7
+
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r9, #8, r9, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r9, r10, lsl #16
+
+ ldr r9, [sp, #8] ; load qs1
+ ldr r11, [sp, #12] ; load ps1
+ ldr lr, c0x80808080
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ add src, src, #2
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+
+ qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u)
+ qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u)
+ eor r8, r8, lr ; *oq1 = s^0x80
+ eor r10, r10, lr ; *op1 = s^0x80
+
+ ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary
+ strb r10, [src, #-4] ; store op1
+ strb r8, [src, #-1] ; store oq1
+ ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #8
+ orr r9, r9, r7, lsl #8
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+
+ mov r10, r10, lsr #8
+ mov r8, r8, lsr #8
+ orr r11, r11, r6, lsl #16
+ orr r9, r9, r7, lsl #16
+
+ ldrb r6, [src, #-5]
+ strb r10, [src, #-4]
+ strb r8, [src, #-1]
+ ldrb r7, [src], pstep
+ orr r11, r11, r6, lsl #24
+ orr r9, r9, r7, lsl #24
+
+ ;roughly 1/7th difference across boundary
+ eor r9, r9, lr
+ eor r11, r11, lr
+
+ mov lr, #0x9 ; 9
+ mov r7, #0x3f ; 63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r12, r10, lr, r7
+ smlatb r10, r10, lr, r7
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ ssat r12, #8, r12, asr #7
+ ssat r10, #8, r10, asr #7
+
+ sub src, src, pstep, lsl #2
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r12, r10, lsl #16
+
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ ldr lr, c0x80808080
+
+ orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u)
+ qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u)
+ eor r8, r8, lr ; *op2 = s^0x80
+ eor r10, r10, lr ; *oq2 = s^0x80
+
+ strb r8, [src, #-5] ; store *op2
+ strb r10, [src], pstep ; store *oq2
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+ mov r8, r8, lsr #8
+ mov r10, r10, lsr #8
+ strb r8, [src, #-5]
+ strb r10, [src], pstep
+
+ ;adjust src pointer for next loop
+ sub src, src, #2
+
+|mbvskip_filter|
+ sub src, src, #4
+ subs count, count, #1
+
+ pld [src, #23] ; preload for next block
+ ldrne r6, [src], pstep ; load source data
+ pld [src, #23]
+ ldrne r7, [src], pstep
+ pld [src, #23]
+ ldrne r8, [src], pstep
+ pld [src, #23]
+ ldrne lr, [src], pstep
+
+ bne MBVnext8
+
+ add sp, sp, #16
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+c0x01010101 DCD 0x01010101
+c0x7F7F7F7F DCD 0x7F7F7F7F
+
+ END
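
The three staged corrections above (the "3/7th", "2/7th" and "1/7th difference" blocks) apply the VP8 wide-filter taps four pixels at a time with smlabb/smlatb and ssat. A minimal scalar sketch of the same sequence, assuming samples already offset by 0x80 as in the code above; the helper names are illustrative, not part of libvpx:

    /* clamp to the signed 8-bit range, as vp8_signed_char_clamp() does */
    static signed char clamp8(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* w is the masked filter value (vp8_filter & ~hev) */
    static void mb_filter_taps(signed char w,
                               signed char *op2, signed char *op1,
                               signed char *op0, signed char *oq0,
                               signed char *oq1, signed char *oq2)
    {
        int u;

        u = clamp8((27 * w + 63) >> 7);   /* roughly 3/7 of the difference */
        *oq0 = clamp8(*oq0 - u);
        *op0 = clamp8(*op0 + u);

        u = clamp8((18 * w + 63) >> 7);   /* roughly 2/7 */
        *oq1 = clamp8(*oq1 - u);
        *op1 = clamp8(*op1 + u);

        u = clamp8((9 * w + 63) >> 7);    /* roughly 1/7 */
        *oq2 = clamp8(*oq2 - u);
        *op2 = clamp8(*op2 + u);
    }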
diff --git a/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
new file mode 100644
index 0000000..5e00cf0
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -0,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
+ EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+ MACRO
+ TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+ ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+ ; a0: 03 02 01 00
+ ; a1: 13 12 11 10
+ ; a2: 23 22 21 20
+ ; a3: 33 32 31 30
+ ; b3 b2 b1 b0
+
+ uxtb16 $b1, $a1 ; xx 12 xx 10
+ uxtb16 $b0, $a0 ; xx 02 xx 00
+ uxtb16 $b3, $a3 ; xx 32 xx 30
+ uxtb16 $b2, $a2 ; xx 22 xx 20
+ orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+ orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+ uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+ uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+ uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+ uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+ orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+ orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+ pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+ pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+ pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+ pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+ MEND
+
+
+
+src RN r0
+pstep RN r1
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrb r12, [r2] ; blimit
+ ldr r3, [src, -pstep, lsl #1] ; p1
+ ldr r4, [src, -pstep] ; p0
+ ldr r5, [src] ; q0
+ ldr r6, [src, pstep] ; q1
+ orr r12, r12, r12, lsl #8 ; blimit
+ ldr r2, c0x80808080
+ orr r12, r12, r12, lsl #16 ; blimit
+    mov     r9, #4              ; loop counter: 16 pixels, 4 at a time
+ mov lr, #0 ; need 0 in a couple places
+
+|simple_hnext8|
+ ; vp8_simple_filter_mask()
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r10, r4, r5 ; p0 - q0
+ uqsub8 r11, r5, r4 ; q0 - p0
+ orr r8, r8, r7 ; abs(p1 - q1)
+ orr r10, r10, r11 ; abs(p0 - q0)
+ uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
+    uhadd8  r8, r8, lr          ; abs(p1 - q1) >> 1
+ uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r8, #0
+ usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
+ sel r10, r8, lr ; filter mask: F or 0
+ cmp r10, #0
+ beq simple_hskip_filter ; skip filtering if all masks are 0x00
+
+ ;vp8_simple_filter()
+
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r7, c0x04040404
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r8, c0x03030303
+ qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, r10 ; vp8_filter &= mask
+
+ qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
+ qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
+
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr ; Filter1 >>= 3
+ shadd8 r8 , r8 , lr ; Filter2 >>= 3
+
+ qsub8 r5 ,r5, r7 ; u = q0 - Filter1
+ qadd8 r4, r4, r8 ; u = p0 + Filter2
+ eor r5, r5, r2 ; *oq0 = u^0x80
+ str r5, [src] ; store oq0 result
+ eor r4, r4, r2 ; *op0 = u^0x80
+ str r4, [src, -pstep] ; store op0 result
+
+|simple_hskip_filter|
+ subs r9, r9, #1
+ addne src, src, #4 ; next row
+
+ ldrne r3, [src, -pstep, lsl #1] ; p1
+ ldrne r4, [src, -pstep] ; p0
+ ldrne r5, [src] ; q0
+ ldrne r6, [src, pstep] ; q1
+
+ bne simple_hnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp8_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrb r12, [r2] ; r12: blimit
+ ldr r2, c0x80808080
+ orr r12, r12, r12, lsl #8
+
+    ; load source data to r7, r8, r9, r10
+ ldrh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
+ ldrh r4, [src], pstep
+ orr r12, r12, r12, lsl #16
+
+ ldrh r5, [src, #-2]
+ pld [src, #23]
+ ldrh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrh r3, [src, #-2]
+ pld [src, #23]
+ ldrh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrh r5, [src, #-2]
+ pld [src, #23]
+ ldrh r6, [src], pstep
+    mov     r11, #4             ; loop counter: 16 rows, 4 at a time
+
+|simple_vnext8|
+ ; vp8_simple_filter_mask() function
+ pkhbt r9, r3, r4, lsl #16
+ pkhbt r10, r5, r6, lsl #16
+
+ ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+ TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+ uqsub8 r7, r3, r6 ; p1 - q1
+ uqsub8 r8, r6, r3 ; q1 - p1
+ uqsub8 r9, r4, r5 ; p0 - q0
+ uqsub8 r10, r5, r4 ; q0 - p0
+ orr r7, r7, r8 ; abs(p1 - q1)
+ orr r9, r9, r10 ; abs(p0 - q0)
+ mov r8, #0
+ uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+ uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
+ uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+ mvn r10, #0 ; r10 == -1
+
+ usub8 r7, r12, r7 ; compare to flimit
+ sel lr, r10, r8 ; filter mask
+
+ cmp lr, #0
+ beq simple_vskip_filter ; skip filtering
+
+ ;vp8_simple_filter() function
+ eor r3, r3, r2 ; p1 offset to convert to a signed value
+ eor r6, r6, r2 ; q1 offset to convert to a signed value
+ eor r4, r4, r2 ; p0 offset to convert to a signed value
+ eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+ qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+
+ qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
+ ldr r9, c0x03030303 ; r9 = 3
+
+ qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
+ ldr r7, c0x04040404
+
+ qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, lr ; vp8_filter &= mask
+
+ qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
+ qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
+
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r8 ; Filter1 >>= 3
+
+ ;calculate output
+ sub src, src, pstep, lsl #2
+
+ qadd8 r4, r4, r9 ; u = p0 + Filter2
+ qsub8 r5, r5, r3 ; u = q0 - Filter1
+ eor r4, r4, r2 ; *op0 = u^0x80
+ eor r5, r5, r2 ; *oq0 = u^0x80
+
+ strb r4, [src, #-1] ; store the result
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ mov r4, r4, lsr #8
+ strb r5, [src], pstep
+ mov r5, r5, lsr #8
+
+ strb r4, [src, #-1]
+ strb r5, [src], pstep
+
+|simple_vskip_filter|
+ subs r11, r11, #1
+
+    ; load source data to r7, r8, r9, r10
+ ldrneh r3, [src, #-2]
+ pld [src, #23] ; preload for next block
+ ldrneh r4, [src], pstep
+
+ ldrneh r5, [src, #-2]
+ pld [src, #23]
+ ldrneh r6, [src], pstep
+
+ pkhbt r7, r3, r4, lsl #16
+
+ ldrneh r3, [src, #-2]
+ pld [src, #23]
+ ldrneh r4, [src], pstep
+
+ pkhbt r8, r5, r6, lsl #16
+
+ ldrneh r5, [src, #-2]
+ pld [src, #23]
+ ldrneh r6, [src], pstep
+
+ bne simple_vnext8
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+
+ END
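
Both routines above compute one breakout mask per pixel, abs(p0-q0)*2 + abs(p1-q1)/2 compared against blimit, and adjust only p0 and q0 where it passes. A scalar sketch of the per-pixel operation the SIMD code performs four at a time (a sketch only; the asymmetric +4/+3 rounding mirrors the Filter1/Filter2 split, and an arithmetic right shift is assumed, which the shadd8 sequence provides in the asm):

    #include <stdlib.h>   /* abs() */

    static signed char clamp8(int v)
    {
        return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    static void simple_filter_sketch(int blimit,
                                     unsigned char *p1, unsigned char *p0,
                                     unsigned char *q0, unsigned char *q1)
    {
        signed char ps1, ps0, qs0, qs1;
        int f, f1, f2;

        /* mask == 0 corresponds to the simple_*skip_filter branches */
        if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit)
            return;

        ps1 = (signed char)(*p1 ^ 0x80);  /* offset to signed values */
        ps0 = (signed char)(*p0 ^ 0x80);
        qs0 = (signed char)(*q0 ^ 0x80);
        qs1 = (signed char)(*q1 ^ 0x80);

        f  = clamp8(clamp8(ps1 - qs1) + 3 * (qs0 - ps0));
        f1 = clamp8(f + 4) >> 3;          /* Filter1: round one side by 4 */
        f2 = clamp8(f + 3) >> 3;          /* Filter2: the other side by 3 */

        *q0 = (unsigned char)(clamp8(qs0 - f1) ^ 0x80);
        *p0 = (unsigned char)(clamp8(ps0 + f2) ^ 0x80);
    }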
diff --git a/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
new file mode 100644
index 0000000..e81aef5
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/sixtappredict8x4_v6.asm
@@ -0,0 +1,273 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_armv6|
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on
+;the stack. Temporary stack size is 184 bytes. Line width is 20 bytes: 9 shorts of
+;data plus 2 padding bytes for 4-byte alignment. In the second pass, the data is
+;loaded back from the stack and the result is stored transposed again.
+|vp8_sixtap_predict8x4_armv6| PROC
+ stmdb sp!, {r4 - r11, lr}
+ str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ add lr, sp, #4 ;point to temporary buffer
+ beq skip_firstpass_filter
+
+;first-pass filter
+ adr r12, filter8_coeff
+ sub r0, r0, r1, lsl #1
+
+    add     r3, r1, #10                 ; preload next row
+ pld [r0, r3]
+
+ add r2, r12, r2, lsl #4 ;calculate filter location
+    add     r0, r0, #3                  ;adjust src only for loading convenience
+
+ ldr r3, [r2] ; load up packed filter coefficients
+ ldr r4, [r2, #4]
+ ldr r5, [r2, #8]
+
+ mov r2, #0x90000 ; height=9 is top part of counter
+
+ sub r1, r1, #8
+
+|first_pass_hloop_v6|
+ ldrb r6, [r0, #-5] ; load source data
+ ldrb r7, [r0, #-4]
+ ldrb r8, [r0, #-3]
+ ldrb r9, [r0, #-2]
+ ldrb r10, [r0, #-1]
+
+ orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+ pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+ pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+ pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+ pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+ smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
+ smuad r12, r7, r3
+
+ ldrb r6, [r0], #1
+
+ smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
+ ldrb r7, [r0], #1
+ smlad r12, r9, r4, r12
+
+ pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+ pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+ smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
+ smlad r12, r6, r5, r12
+
+ sub r2, r2, #1
+
+ add r11, r11, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff ; test loop counter
+ usat r11, #8, r11, asr #7
+ add r12, r12, #0x40
+    strh    r11, [lr], #20              ; result is transposed and stored
+ usat r12, #8, r12, asr #7
+
+ strh r12, [lr], #20
+
+ movne r11, r6
+ movne r12, r7
+
+ movne r6, r8
+ movne r7, r9
+ movne r8, r10
+ movne r9, r11
+ movne r10, r12
+
+ bne first_pass_wloop_v6
+
+ ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+ ;;IF ARCHITECTURE=6
+ ;pld [src, ppl]
+ ;;pld [src, r9]
+ ;;ENDIF
+
+ subs r2, r2, #0x10000
+
+ sub lr, lr, #158
+
+ add r0, r0, r1 ; move to next input line
+
+    add     r11, r1, #18                ; preload next row; add back block width (=8), which was subtracted earlier
+ pld [r0, r11]
+
+ bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+ ldr r3, [sp], #4 ; load back yoffset
+ ldr r0, [sp, #216] ; load dst address from stack 180+36
+ ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+ cmp r3, #0
+ beq skip_secondpass_filter
+
+ adr r12, filter8_coeff
+ add lr, r12, r3, lsl #4 ;calculate filter location
+
+ mov r2, #0x00080000
+
+ ldr r3, [lr] ; load up packed filter coefficients
+ ldr r4, [lr, #4]
+ ldr r5, [lr, #8]
+
+ pkhbt r12, r4, r3 ; pack the filter differently
+ pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+ ldr r6, [sp] ; load the data
+ ldr r7, [sp, #4]
+
+ orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+ smuad lr, r3, r6 ; apply filter
+ smulbt r10, r3, r6
+
+ ldr r8, [sp, #8]
+
+ smlad lr, r4, r7, lr
+ smladx r10, r12, r7, r10
+
+ ldrh r9, [sp, #12]
+
+ smlad lr, r5, r8, lr
+ smladx r10, r11, r8, r10
+
+ add sp, sp, #4
+ smlatb r10, r5, r9, r10
+
+ sub r2, r2, #1
+
+ add lr, lr, #0x40 ; round_shift_and_clamp
+ tst r2, #0xff
+ usat lr, #8, lr, asr #7
+ add r10, r10, #0x40
+ strb lr, [r0], r1 ; the result is transposed back and stored
+ usat r10, #8, r10, asr #7
+
+ strb r10, [r0],r1
+
+ movne r6, r7
+ movne r7, r8
+
+ bne second_pass_wloop_v6
+
+ subs r2, r2, #0x10000
+    add     sp, sp, #12                 ; update src for next loop (20-8)
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne second_pass_hloop_v6
+
+ add sp, sp, #20
+ ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ mov r2, #9
+
+skip_firstpass_hloop
+ ldrb r4, [r0], #1 ; load data
+ subs r2, r2, #1
+ ldrb r5, [r0], #1
+ strh r4, [lr], #20 ; store it to immediate buffer
+ ldrb r6, [r0], #1 ; load data
+ strh r5, [lr], #20
+ ldrb r7, [r0], #1
+ strh r6, [lr], #20
+ ldrb r8, [r0], #1
+ strh r7, [lr], #20
+ ldrb r9, [r0], #1
+ strh r8, [lr], #20
+ ldrb r10, [r0], #1
+ strh r9, [lr], #20
+ ldrb r11, [r0], #1
+ strh r10, [lr], #20
+ add r0, r0, r1 ; move to next input line
+ strh r11, [lr], #20
+
+ sub lr, lr, #158 ; move over to next column
+ bne skip_firstpass_hloop
+
+ b secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+ mov r2, #8
+ add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+ ldr r6, [sp], #4
+ subs r2, r2, #1
+ ldr r8, [sp], #4
+
+ mov r7, r6, lsr #16 ; unpack
+ strb r6, [r0], r1
+ mov r9, r8, lsr #16
+ strb r7, [r0], r1
+ add sp, sp, #12 ; 20-8
+ strb r8, [r0], r1
+ strb r9, [r0], r1
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #1
+
+ bne skip_secondpass_hloop
+
+ add sp, sp, #16 ; 180 - (160 +4)
+
+ ldmia sp!, {r4 - r11, pc}
+
+ ENDP
+
+;-----------------
+;One word each is reserved. Label filter8_coeff can be used to access the data.
+;Data addresses: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
+filter8_coeff
+ DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
+ DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
+ DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
+ DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
+ DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
+ DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
+ DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
+ DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
+
+ ;DCD 0, 0, 128, 0, 0, 0
+ ;DCD 0, -6, 123, 12, -1, 0
+ ;DCD 2, -11, 108, 36, -8, 1
+ ;DCD 0, -9, 93, 50, -6, 0
+ ;DCD 3, -16, 77, 77, -16, 3
+ ;DCD 0, -6, 50, 93, -9, 0
+ ;DCD 1, -8, 36, 108, -11, 2
+ ;DCD 0, -1, 12, 123, -6, 0
+
+ END
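
The note at the top of this file describes the two-pass scheme: the horizontal pass writes its 9-row intermediate block transposed so the vertical pass can read it with unit stride. A hedged scalar sketch of the same data flow (names are illustrative; the real code packs pixel pairs and coefficients for smuad/smlad, and an arithmetic right shift is assumed):

    /* saturate to an unsigned byte, as usat #8 does */
    static unsigned char clamp_u8(int v)
    {
        return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    void sixtap_8x4_sketch(const unsigned char *src, int src_stride,
                           const short *hfilter, const short *vfilter,
                           unsigned char *dst, int dst_stride)
    {
        short tmp[8][10];   /* stride of 10 shorts = the asm's 20-byte lines */
        int r, c, k, sum;

        /* first pass: 6-tap horizontal filter over 9 rows (4 + 5 tap rows),
           starting 2 rows above and 2 columns left of the block origin */
        for (r = 0; r < 9; r++)
            for (c = 0; c < 8; c++) {
                sum = 0;
                for (k = 0; k < 6; k++)
                    sum += src[(r - 2) * src_stride + (c - 2 + k)] * hfilter[k];
                tmp[c][r] = (short)clamp_u8((sum + 64) >> 7);  /* transposed */
            }

        /* second pass: 6-tap vertical filter over the transposed buffer */
        for (r = 0; r < 4; r++)
            for (c = 0; c < 8; c++) {
                sum = 0;
                for (k = 0; k < 6; k++)
                    sum += tmp[c][r + k] * vfilter[k];
                dst[r * dst_stride + c] = clamp_u8((sum + 64) >> 7);
            }
    }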
diff --git a/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
new file mode 100644
index 0000000..1b4f5cf
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
@@ -0,0 +1,96 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 const unsigned char *src_ptr
+; r1 int src_stride
+; r2 const unsigned char *ref_ptr
+; r3 int ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ mov r4, #0 ; sad = 0;
+ mov r5, #8 ; loop count
+
+loop
+ ; 1st row
+ ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
+
+ usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
+
+ add r0, r0, r1 ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
+ ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
+ add r4, r4, r8 ; add partial sad values
+
+ ; 2nd row
+ ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
+ ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
+ ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
+ ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
+
+ usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
+ usad8 r8, r7, r9 ; calculate sad for 4 pixels
+
+ ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
+ ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
+
+ add r0, r0, r1 ; set src pointer to next row
+    add     r2, r2, r3          ; set ref pointer to next row
+
+ usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
+ usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
+
+ pld [r0, r1, lsl #1]
+ pld [r2, r3, lsl #1]
+
+ subs r5, r5, #1 ; decrement loop counter
+ add r4, r4, r8 ; add partial sad values
+
+ bne loop
+
+ mov r0, r4 ; return sad
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
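Functionally, the usad8/usada8 pairs above accumulate a plain sum of absolute differences over the 16x16 block, two rows per loop iteration, with preloads hiding the memory latency. The scalar equivalent is simply:

    #include <stdlib.h>   /* abs() */

    unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++)
                sad += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }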
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
new file mode 100644
index 0000000..dc84c30
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
@@ -0,0 +1,154 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance16x16_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r9, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+ END
+
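The return value matches the final instructions above: sse - ((sum * sum) >> 8), where 8 = log2(256 pixels in the block); the 8x8 variant that follows differs only in the block size and its >> 6 shift. A scalar sketch of the whole routine:

    unsigned int variance16x16_sketch(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse)
    {
        int sum = 0, r, c, d;
        unsigned int sq = 0;

        for (r = 0; r < 16; r++) {
            for (c = 0; c < 16; c++) {
                d = src[c] - ref[c];
                sum += d;                     /* signed sum of differences */
                sq += (unsigned int)(d * d);  /* sum of squared differences */
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;
        /* 32-bit wraparound on sum * sum matches the asm's mul + lsr #8 */
        return sq - (((unsigned int)sum * (unsigned int)sum) >> 8);
    }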
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
new file mode 100644
index 0000000..adc353d
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
@@ -0,0 +1,101 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance8x8_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+ push {r4-r10, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #8 ; set loop counter to 8 (=block height)
+ mov r4, #0 ; initialize sum = 0
+ mov r5, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r6, [r0, #0x0] ; load 4 src pixels
+ ldr r7, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r6, r7 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r0, #0x4] ; load 4 src pixels
+ ldr r7, [r2, #0x4] ; load 4 ref pixels
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r6, r7 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r10, r8, lr ; select bytes with positive difference
+ usub8 r9, r7, r6 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r6, r10, lr ; calculate sum of positive differences
+ usad8 r7, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r10 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r4, r4, r6 ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r7, r8 ; byte (two pixels) to halfwords
+ uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
+ smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1 ; next row
+ smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r8, [sp, #32] ; get address of sse
+ mul r1, r4, r4 ; sum * sum
+ str r5, [r8] ; store sse
+ sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
+
+ pop {r4-r10, pc}
+
+ ENDP
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
new file mode 100644
index 0000000..dd2ce68
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
@@ -0,0 +1,182 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_h_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_h_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
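The three-instruction "bilinear interpolation" above (mvn/uhsub8/eor) computes a rounded byte-wise average without widening to 16 bits: since ~b = 255 - b, (a - ~b) >> 1 equals (a + b - 255) >> 1, and XOR-ing with 0x80 adds back the missing 128, giving (a + b + 1) >> 1 per byte. The _hv and _v variants that follow reuse the same trick for the second interpolation direction. A per-byte check of the identity (assumes an arithmetic right shift on signed int, which is what uhsub8's halving performs):

    #include <assert.h>

    static unsigned char avg_trick(unsigned char a, unsigned char b)
    {
        int t = (a - (0xff ^ b)) >> 1;     /* mvn + uhsub8 */
        return (unsigned char)(t ^ 0x80);  /* eor restores the bias */
    }

    void check_avg_trick(void)
    {
        int a, b;
        for (a = 0; a < 256; a++)
            for (b = 0; b < 256; b++)
                assert(avg_trick((unsigned char)a, (unsigned char)b) ==
                       ((a + b + 1) >> 1));
    }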
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
new file mode 100644
index 0000000..f972d9b
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_hv_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; pointer to pixels on the next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load source pixels a, row N
+ ldr r6, [r0, #1] ; load source pixels b, row N
+ ldr r5, [r9, #0] ; load source pixels c, row N+1
+ ldr r7, [r9, #1] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #0] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load source pixels a, row N
+ ldr r6, [r0, #5] ; load source pixels b, row N
+ ldr r5, [r9, #4] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #5] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #4] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load source pixels a, row N
+ ldr r6, [r0, #9] ; load source pixels b, row N
+ ldr r5, [r9, #8] ; load source pixels c, row N+1
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ ldr r7, [r9, #9] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #8] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load source pixels a, row N
+ ldr r6, [r0, #13] ; load source pixels b, row N
+ ldr r5, [r9, #12] ; load source pixels c, row N+1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+ ldr r7, [r9, #13] ; load source pixels d, row N+1
+
+ ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+ ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
+ mvn r7, r7
+ uhsub8 r5, r5, r7
+ eor r5, r5, r10
+ ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
+ mvn r5, r5
+ uhsub8 r4, r4, r5
+ ldr r5, [r2, #12] ; load 4 ref pixels
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ subs r12, r12, #1
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
new file mode 100644
index 0000000..f5da9c0
--- /dev/null
+++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
@@ -0,0 +1,184 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_v_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance_halfpixvar16x16_v_armv6| PROC
+
+ stmfd sp!, {r4-r12, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r8, #0 ; initialize sum = 0
+ ldr r10, c80808080
+ mov r11, #0 ; initialize sse = 0
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov lr, #0 ; constant zero
+loop
+ add r9, r0, r1 ; set src pointer to next row
+ ; 1st 4 pixels
+ ldr r4, [r0, #0] ; load 4 src pixels
+ ldr r6, [r9, #0] ; load 4 src pixels from next row
+ ldr r5, [r2, #0] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ usub8 r6, r4, r5 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+ ; calculate total sum
+ adds r8, r8, r4 ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r4, [r0, #4] ; load 4 src pixels
+ ldr r6, [r9, #4] ; load 4 src pixels from next row
+ ldr r5, [r2, #4] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r4, [r0, #8] ; load 4 src pixels
+ ldr r6, [r9, #8] ; load 4 src pixels from next row
+ ldr r5, [r2, #8] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r4, [r0, #12] ; load 4 src pixels
+ ldr r6, [r9, #12] ; load 4 src pixels from next row
+ ldr r5, [r2, #12] ; load 4 ref pixels
+
+ ; bilinear interpolation
+ mvn r6, r6
+ uhsub8 r4, r4, r6
+ eor r4, r4, r10
+
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r6, r4, r5 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r6, lr ; select bytes with positive difference
+ usub8 r6, r5, r4 ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+ sel r6, r6, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r4, r7, lr ; calculate sum of positive differences
+ usad8 r5, r6, lr ; calculate sum of negative differences
+ orr r6, r6, r7 ; differences of all 4 pixels
+
+ ; calculate total sum
+ add r8, r8, r4 ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+ ; calculate sse
+ uxtb16 r5, r6 ; byte (two pixels) to halfwords
+ uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
+ smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
+ smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
+
+
+ subs r12, r12, #1
+
+ bne loop
+
+ ; return stuff
+ ldr r6, [sp, #40] ; get address of sse
+ mul r0, r8, r8 ; sum * sum
+ str r11, [r6] ; store sse
+ sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+ ldmfd sp!, {r4-r12, pc}
+
+ ENDP
+
+c80808080
+ DCD 0x80808080
+
+ END
+
diff --git a/libvpx/vp8/common/arm/bilinearfilter_arm.c b/libvpx/vp8/common/arm/bilinearfilter_arm.c
new file mode 100644
index 0000000..c63073c
--- /dev/null
+++ b/libvpx/vp8/common/arm/bilinearfilter_arm.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "bilinearfilter_arm.h"
+
+void vp8_filter_block2d_bil_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+)
+{
+ unsigned short FData[36*16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp8_bilinear_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp8_bilinear_predict8x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp8_bilinear_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
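Each pass is a 2-tap filter whose coefficient pairs sum to 128 (VP8_FILTER_WEIGHT), applied with +64 rounding and a 7-bit shift; the first pass covers Height + 1 rows so the vertical pass has the extra row of context it needs. A scalar sketch of the horizontal pass's contract (the armv6 implementation itself lives in bilinearfilter_v6.asm):

    /* one horizontal bilinear pass: dst is a dense width-stride buffer */
    static void bil_first_pass_sketch(const unsigned char *src,
                                      unsigned short *dst,
                                      unsigned int src_pitch,
                                      unsigned int height, unsigned int width,
                                      const short *filter)
    {
        unsigned int r, c;

        for (r = 0; r < height; r++) {
            for (c = 0; c < width; c++)
                dst[c] = (unsigned short)((src[c] * filter[0] +
                                           src[c + 1] * filter[1] + 64) >> 7);
            src += src_pitch;
            dst += width;
        }
    }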
diff --git a/libvpx/vp8/common/arm/bilinearfilter_arm.h b/libvpx/vp8/common/arm/bilinearfilter_arm.h
new file mode 100644
index 0000000..b7155d3
--- /dev/null
+++ b/libvpx/vp8/common/arm/bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+ const unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+ const unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
diff --git a/libvpx/vp8/common/arm/dequantize_arm.c b/libvpx/vp8/common/arm/dequantize_arm.c
new file mode 100644
index 0000000..70e72aa
--- /dev/null
+++ b/libvpx/vp8/common/arm/dequantize_arm.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/blockd.h"
+
+#if HAVE_NEON
+extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_MEDIA
+extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
+#endif
+
+#if HAVE_NEON
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
+{
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ vp8_dequantize_b_loop_neon(Q, DQC, DQ);
+}
+#endif
+
+#if HAVE_MEDIA
+void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
+{
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ vp8_dequantize_b_loop_v6(Q, DQC, DQ);
+}
+#endif
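
The loop kernels declared above reduce, in scalar form, to one multiply per coefficient over the 16 coefficients of a 4x4 block -- a minimal sketch (the real loops are in dequantize_v6.asm and dequantizeb_neon.asm):

    static void dequantize_b_loop_ref(const short *Q, const short *DQC, short *DQ)
    {
        int i;
        for (i = 0; i < 16; i++)
            DQ[i] = Q[i] * DQC[i];   /* quantized coeff * dequant factor */
    }
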
diff --git a/libvpx/vp8/common/arm/filter_arm.c b/libvpx/vp8/common/arm/filter_arm.c
new file mode 100644
index 0000000..148951a
--- /dev/null
+++ b/libvpx/vp8/common/arm/filter_arm.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include <math.h>
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+extern void vp8_filter_block2d_first_pass_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+// 8x8
+extern void vp8_filter_block2d_first_pass_8x8_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+// 16x16
+extern void vp8_filter_block2d_first_pass_16x16_armv6
+(
+ unsigned char *src_ptr,
+ short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp8_filter
+);
+
+extern void vp8_filter4_block2d_second_pass_armv6
+(
+ short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int cnt,
+ const short *vp8_filter
+);
+
+extern void vp8_filter_block2d_first_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp8_filter
+);
+
+
+extern void vp8_filter_block2d_second_pass_only_armv6
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int cnt,
+ unsigned int output_pitch,
+ const short *vp8_filter
+);
+
+#if HAVE_MEDIA
+void vp8_sixtap_predict4x4_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* Vfilter is null. First pass only */
+ if (xoffset && !yoffset)
+ {
+ /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
+ vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
+
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* Vfilter is a 4 tap filter */
+ if (yoffset & 0x1)
+ {
+ vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ /* Vfilter is 6 tap filter */
+ else
+ {
+ vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ {
+ vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
+ else
+ {
+ vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+ }
+ }
+}
+
+
+void vp8_sixtap_predict16x16_armv6
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ if (xoffset && !yoffset)
+ {
+ vp8_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
+ }
+ /* Hfilter is null. Second pass only */
+ else if (!xoffset && yoffset)
+ {
+ vp8_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
+ }
+ else
+ {
+ if (yoffset & 0x1)
+ {
+ vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+ vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
+ else
+ {
+ vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
+ vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+ }
+ }
+
+}
+#endif
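
The "yoffset & 0x1" tests above exploit a property of vp8_sub_pel_filters: the rows for odd offsets have zero outer taps, so the vertical pass degenerates to a 4-tap filter and the horizontal pass can produce two fewer intermediate rows. A sketch of the row-count arithmetic behind the call sites above (hypothetical helper):

    static unsigned int first_pass_rows(unsigned int block_height, int yoffset)
    {
        /* 6-tap vertical: 2 rows above + 3 below => height + 5 (9, 13, 21)
           4-tap vertical: 1 row above + 2 below => height + 3 (7, 11, 19) */
        return (yoffset & 1) ? block_height + 3 : block_height + 5;
    }
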
diff --git a/libvpx/vp8/common/arm/loopfilter_arm.c b/libvpx/vp8/common/arm/loopfilter_arm.c
new file mode 100644
index 0000000..b8f9bd9
--- /dev/null
+++ b/libvpx/vp8/common/arm/loopfilter_arm.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#if HAVE_MEDIA
+extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
+extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
+#endif
+
+#if HAVE_NEON
+typedef void loopfilter_y_neon(unsigned char *src, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh);
+typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
+ unsigned char blimit, unsigned char limit, unsigned char thresh,
+ unsigned char *v);
+
+extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
+extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
+
+extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
+extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
+#endif
+
+#if HAVE_MEDIA
+/* ARMV6/MEDIA loopfilter functions*/
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+#if HAVE_NEON
+/* NEON loopfilter functions */
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+ vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char mblim = *lfi->mblim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned char blim = *lfi->blim;
+ unsigned char lim = *lfi->lim;
+ unsigned char hev_thr = *lfi->hev_thr;
+
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
+ vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
+}
+#endif
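
For orientation, the offsets used above follow VP8's edge layout: the mb* functions filter a macroblock's outer edge, while the non-mb calls filter the three interior sub-block edges at offsets 4, 8 and 12. On ARMv6 a count of 2 covers the 16-pixel-wide luma plane and count 1 the 8-wide chroma planes; the NEON wrappers instead take scalar thresholds and handle U and V in a single call. A sketch of the luma B-edge pattern, assuming a simplified generic edge-filter callback (not the library's real signature):

    typedef void (*edge_filter_fn)(unsigned char *src, int pitch, int count);

    static void filter_luma_b_rows_ref(unsigned char *y, int y_stride,
                                       edge_filter_fn f)
    {
        int e;
        for (e = 4; e <= 12; e += 4)      /* interior horizontal edges */
            f(y + e * y_stride, y_stride, 2);
    }
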
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
new file mode 100644
index 0000000..e392786
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm
@@ -0,0 +1,357 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_bilinear_predict16x16_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, bifilter16_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ vst1.u8 {d4, d5}, [r4], r5
+ vst1.u8 {d6, d7}, [r4], r5
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_sp16x16_loop_neon
+
+ add sp, sp, #272
+
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r4], r5 ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r4], r5
+ vst1.u8 {d18, d19}, [r4], r5
+ vst1.u8 {d20, d21}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r4], r5 ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r4], r5
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r4], r5
+ vst1.u8 {d8, d9}, [r4], r5
+
+ bne filt_blk2d_spo16x16_loop_neon
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+bifilter16_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
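
The bifilter16_coeff table above packs the eight bilinear filter pairs {128 - 16 * offset, 16 * offset}; "add r2, r12, r2, lsl #3" indexes it because each pair spans 8 bytes (two DCD words). The same data as a C table, mirroring the vp8_bilinear_filters array used by the C wrappers earlier in this change:

    static const short bilinear_filters_ref[8][2] =
    {
        { 128,   0 }, { 112,  16 }, { 96, 32 }, { 80, 48 },
        {  64,  64 }, {  48,  80 }, { 32, 96 }, { 16, 112 }
    };
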
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
new file mode 100644
index 0000000..0ac6243
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm
@@ -0,0 +1,130 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict4x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict4x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x4)
+ vld1.u8 {d2}, [r0], r1 ;load src data
+    add             r2, r12, r2, lsl #3     ;calculate Hfilter location (2 coeffs x 4 bytes = 8 bytes)
+
+ vld1.u8 {d3}, [r0], r1
+ vld1.u32 {d31}, [r2] ;first_pass filter
+
+ vld1.u8 {d4}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
+ vld1.u8 {d5}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {d6}, [r0], r1
+
+ vshr.u64 q4, q1, #8 ;construct src_ptr[1]
+ vshr.u64 q5, q2, #8
+ vshr.u64 d12, d6, #8
+
+ vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d4, d5
+ vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+
+ vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q8, d10, d1
+ vmlal.u8 q9, d12, d1
+
+ vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d29, q8, #7
+ vqrshrn.u16 d30, q9, #7
+
+;Second pass: 4x4
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3 ;calculate Vfilter location
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d28, d0
+ vmull.u8 q2, d29, d0
+
+ vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
+ vext.8 d27, d29, d30, #4
+
+ vmlal.u8 q1, d26, d1
+ vmlal.u8 q2, d27, d1
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+
+ vst1.32 {d2[0]}, [r4] ;store result
+ vst1.32 {d2[1]}, [r0]
+ vst1.32 {d3[0]}, [r1]
+ vst1.32 {d3[1]}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+
+ vld1.32 {d28[0]}, [r0], r1 ;load src data
+ vld1.32 {d28[1]}, [r0], r1
+ vld1.32 {d29[0]}, [r0], r1
+ vld1.32 {d29[1]}, [r0], r1
+ vld1.32 {d30[0]}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.32 {d28[0]}, [r4], lr ;store result
+ vst1.32 {d28[1]}, [r4], lr
+ vst1.32 {d29[0]}, [r4], lr
+ vst1.32 {d29[1]}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
new file mode 100644
index 0000000..41f5c45
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm
@@ -0,0 +1,135 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter8x4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (5x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vld1.u8 {q5}, [r0], r1
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q7, #7
+ vqrshrn.u16 d24, q8, #7
+ vqrshrn.u16 d25, q9, #7
+ vqrshrn.u16 d26, q10, #7
+
+;Second pass: 4x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+
+ add r2, r1, lr
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1]
+ vst1.u8 {d5}, [r2]
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter8x4_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
new file mode 100644
index 0000000..c4711bc
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm
@@ -0,0 +1,183 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_bilinear_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_bilinear_predict8x8_neon| PROC
+ push {r4, lr}
+
+ adr r12, bifilter8_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+    ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq skip_secondpass_filter
+
+ add r3, r12, r3, lsl #3
+ add r0, r4, lr
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ add r1, r0, lr
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2}, [r4] ;store result
+ vst1.u8 {d3}, [r0]
+ vst1.u8 {d4}, [r1], lr
+ vst1.u8 {d5}, [r1], lr
+ vst1.u8 {d6}, [r1], lr
+ vst1.u8 {d7}, [r1], lr
+ vst1.u8 {d8}, [r1], lr
+ vst1.u8 {d9}, [r1], lr
+
+ pop {r4, pc}
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;---------------------
+skip_secondpass_filter
+ vst1.u8 {d22}, [r4], lr ;store result
+ vst1.u8 {d23}, [r4], lr
+ vst1.u8 {d24}, [r4], lr
+ vst1.u8 {d25}, [r4], lr
+ vst1.u8 {d26}, [r4], lr
+ vst1.u8 {d27}, [r4], lr
+ vst1.u8 {d28}, [r4], lr
+ vst1.u8 {d29}, [r4], lr
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+bifilter8_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
new file mode 100644
index 0000000..e3ea91f
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -0,0 +1,584 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_build_intra_predictors_mby_neon_func|
+ EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_neon_func| PROC
+ push {r4-r8, lr}
+
+ cmp r3, #0
+ beq case_dc_pred
+ cmp r3, #1
+ beq case_v_pred
+ cmp r3, #2
+ beq case_h_pred
+ cmp r3, #3
+ beq case_tm_pred
+
+case_dc_pred
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up
+
+ cmp r5, #0
+ beq skip_dc_pred_left
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+
+ pop {r4-r8,pc}
+case_v_pred
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q0}, [r1]!
+ pop {r4-r8,pc}
+
+case_h_pred
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ pop {r4-r8,pc}
+
+case_tm_pred
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1]!
+ vst1.u8 {q1}, [r1]!
+ vst1.u8 {q2}, [r1]!
+ vst1.u8 {q3}, [r1]!
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; r0 unsigned char *y_buffer
+; r1 unsigned char *ypred_ptr
+; r2 int y_stride
+; r3 int mode
+; stack int Up
+; stack int Left
+
+|vp8_build_intra_predictors_mby_s_neon_func| PROC
+ push {r4-r8, lr}
+
+ mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
+
+ cmp r3, #0
+ beq case_dc_pred_s
+ cmp r3, #1
+ beq case_v_pred_s
+ cmp r3, #2
+ beq case_h_pred_s
+ cmp r3, #3
+ beq case_tm_pred_s
+
+case_dc_pred_s
+ ldr r4, [sp, #24] ; Up
+ ldr r5, [sp, #28] ; Left
+
+ ; Default the DC average to 128
+ mov r12, #128
+ vdup.u8 q0, r12
+
+ ; Zero out running sum
+ mov r12, #0
+
+ ; compute shift and jump
+ adds r7, r4, r5
+ beq skip_dc_pred_up_left_s
+
+ ; Load above row, if it exists
+ cmp r4, #0
+ beq skip_dc_pred_up_s
+
+ sub r6, r0, r2
+ vld1.8 {q1}, [r6]
+ vpaddl.u8 q2, q1
+ vpaddl.u16 q3, q2
+ vpaddl.u32 q4, q3
+
+ vmov.32 r4, d8[0]
+ vmov.32 r6, d9[0]
+
+ add r12, r4, r6
+
+    ; Move back to integer registers
+
+skip_dc_pred_up_s
+
+ cmp r5, #0
+ beq skip_dc_pred_left_s
+
+ sub r0, r0, #1
+
+ ; Load left row, if it exists
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0]
+
+ add r12, r12, r3
+ add r12, r12, r4
+ add r12, r12, r5
+ add r12, r12, r6
+
+skip_dc_pred_left_s
+ add r7, r7, #3 ; Shift
+ sub r4, r7, #1
+ mov r5, #1
+ add r12, r12, r5, lsl r4
+ mov r5, r12, lsr r7 ; expected_dc
+
+ vdup.u8 q0, r5
+
+skip_dc_pred_up_left_s
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+
+ pop {r4-r8,pc}
+case_v_pred_s
+ ; Copy down above row
+ sub r6, r0, r2
+ vld1.8 {q0}, [r6]
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q0}, [r1], r2
+ pop {r4-r8,pc}
+
+case_h_pred_s
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u8 q0, r3
+ vdup.u8 q1, r4
+ vdup.u8 q2, r5
+ vdup.u8 q3, r6
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ pop {r4-r8,pc}
+
+case_tm_pred_s
+ ; Load yabove_row
+ sub r3, r0, r2
+ vld1.8 {q8}, [r3]
+
+ ; Load ytop_left
+ sub r3, r3, #1
+ ldrb r7, [r3]
+
+ vdup.u16 q7, r7
+
+ ; Compute yabove_row - ytop_left
+ mov r3, #1
+ vdup.u8 q0, r3
+
+ vmull.u8 q4, d16, d0
+ vmull.u8 q5, d17, d0
+
+ vsub.s16 q4, q4, q7
+ vsub.s16 q5, q5, q7
+
+ ; Load 4x yleft_col
+ sub r0, r0, #1
+ mov r12, #4
+
+case_tm_pred_loop_s
+ ldrb r3, [r0], r2
+ ldrb r4, [r0], r2
+ ldrb r5, [r0], r2
+ ldrb r6, [r0], r2
+ vdup.u16 q0, r3
+ vdup.u16 q1, r4
+ vdup.u16 q2, r5
+ vdup.u16 q3, r6
+
+ vqadd.s16 q8, q0, q4
+ vqadd.s16 q9, q0, q5
+
+ vqadd.s16 q10, q1, q4
+ vqadd.s16 q11, q1, q5
+
+ vqadd.s16 q12, q2, q4
+ vqadd.s16 q13, q2, q5
+
+ vqadd.s16 q14, q3, q4
+ vqadd.s16 q15, q3, q5
+
+ vqshrun.s16 d0, q8, #0
+ vqshrun.s16 d1, q9, #0
+
+ vqshrun.s16 d2, q10, #0
+ vqshrun.s16 d3, q11, #0
+
+ vqshrun.s16 d4, q12, #0
+ vqshrun.s16 d5, q13, #0
+
+ vqshrun.s16 d6, q14, #0
+ vqshrun.s16 d7, q15, #0
+
+ vst1.u8 {q0}, [r1], r2
+ vst1.u8 {q1}, [r1], r2
+ vst1.u8 {q2}, [r1], r2
+ vst1.u8 {q3}, [r1], r2
+
+ subs r12, r12, #1
+ bne case_tm_pred_loop_s
+
+ pop {r4-r8,pc}
+
+ ENDP
+
+
+ END
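
A scalar sketch of the DC branch above: the shift 3 + Up + Left selects division by 8, 16 or 32 according to how many borders contribute samples, and "add r12, r12, r5, lsl r4" supplies the rounding term 1 << (shift - 1). Names here are hypothetical:

    static int expected_dc_ref(int sum, int up_available, int left_available)
    {
        int shift;
        if (!up_available && !left_available)
            return 128;                  /* default when no borders exist */
        shift = 3 + up_available + left_available;  /* 16 or 32 samples */
        return (sum + (1 << (shift - 1))) >> shift;
    }
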
diff --git a/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm b/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm
new file mode 100644
index 0000000..bda4b96
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm
@@ -0,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem16x16_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem16x16_neon| PROC
+
+ vld1.u8 {q0}, [r0], r1
+ vld1.u8 {q1}, [r0], r1
+ vld1.u8 {q2}, [r0], r1
+ vst1.u8 {q0}, [r2], r3
+ vld1.u8 {q3}, [r0], r1
+ vst1.u8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {q2}, [r2], r3
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {q4}, [r2], r3
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {q5}, [r2], r3
+ vld1.u8 {q8}, [r0], r1
+ vst1.u8 {q6}, [r2], r3
+ vld1.u8 {q9}, [r0], r1
+ vst1.u8 {q7}, [r2], r3
+ vld1.u8 {q10}, [r0], r1
+ vst1.u8 {q8}, [r2], r3
+ vld1.u8 {q11}, [r0], r1
+ vst1.u8 {q9}, [r2], r3
+ vld1.u8 {q12}, [r0], r1
+ vst1.u8 {q10}, [r2], r3
+ vld1.u8 {q13}, [r0], r1
+ vst1.u8 {q11}, [r2], r3
+ vld1.u8 {q14}, [r0], r1
+ vst1.u8 {q12}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+ vst1.u8 {q13}, [r2], r3
+ vst1.u8 {q14}, [r2], r3
+ vst1.u8 {q15}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem16x16_neon|
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm b/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm
new file mode 100644
index 0000000..35c0f67
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm
@@ -0,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x4_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x4_neon| PROC
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vst1.u8 {d3}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x4_neon|
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm b/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm
new file mode 100644
index 0000000..1f5b941
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm
@@ -0,0 +1,43 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_copy_mem8x8_neon|
+ ; ARM
+ ; REQUIRE8
+ ; PRESERVE8
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+|vp8_copy_mem8x8_neon| PROC
+
+ vld1.u8 {d0}, [r0], r1
+ vld1.u8 {d1}, [r0], r1
+ vst1.u8 {d0}, [r2], r3
+ vld1.u8 {d2}, [r0], r1
+ vst1.u8 {d1}, [r2], r3
+ vld1.u8 {d3}, [r0], r1
+ vst1.u8 {d2}, [r2], r3
+ vld1.u8 {d4}, [r0], r1
+ vst1.u8 {d3}, [r2], r3
+ vld1.u8 {d5}, [r0], r1
+ vst1.u8 {d4}, [r2], r3
+ vld1.u8 {d6}, [r0], r1
+ vst1.u8 {d5}, [r2], r3
+ vld1.u8 {d7}, [r0], r1
+ vst1.u8 {d6}, [r2], r3
+ vst1.u8 {d7}, [r2], r3
+
+ mov pc, lr
+
+ ENDP ; |vp8_copy_mem8x8_neon|
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm
new file mode 100644
index 0000000..79ff02c
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@@ -0,0 +1,54 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dc_only_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+; int pred_stride, unsigned char *dst_ptr,
+; int dst_stride)
+
+; r0 input_dc
+; r1 pred_ptr
+; r2 pred_stride
+; r3 dst_ptr
+; sp dst_stride
+
+|vp8_dc_only_idct_add_neon| PROC
+ add r0, r0, #4
+ asr r0, r0, #3
+ ldr r12, [sp]
+ vdup.16 q0, r0
+
+ vld1.32 {d2[0]}, [r1], r2
+ vld1.32 {d2[1]}, [r1], r2
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1]
+
+ vaddw.u8 q1, q0, d2
+ vaddw.u8 q2, q0, d4
+
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d4, q2
+
+ vst1.32 {d2[0]}, [r3], r12
+ vst1.32 {d2[1]}, [r3], r12
+ vst1.32 {d4[0]}, [r3], r12
+ vst1.32 {d4[1]}, [r3]
+
+ bx lr
+
+ ENDP
+
+ END
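
A scalar view of the kernel above, equivalent in spirit to the vp8_dc_only_idct_add_c prototype it implements: the DC term is rounded and scaled by (dc + 4) >> 3, added to every predictor pixel of the 4x4 block, and saturated to 8 bits -- which is what the vaddw.u8 / vqmovun.s16 pair does:

    void dc_only_idct_add_ref(short input_dc, unsigned char *pred_ptr,
                              int pred_stride, unsigned char *dst_ptr,
                              int dst_stride)
    {
        int r, c, a1 = (input_dc + 4) >> 3;
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int v = a1 + pred_ptr[c];
                dst_ptr[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred_ptr += pred_stride;
            dst_ptr += dst_stride;
        }
    }
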
diff --git a/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm b/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm
new file mode 100644
index 0000000..602cce6
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm
@@ -0,0 +1,131 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequant_idct_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_dequant_idct_add_neon(short *input, short *dq,
+; unsigned char *dest, int stride)
+; r0 short *input,
+; r1 short *dq,
+; r2 unsigned char *dest
+; r3 int stride
+
+|vp8_dequant_idct_add_neon| PROC
+ vld1.16 {q3, q4}, [r0]
+ vld1.16 {q5, q6}, [r1]
+
+ add r1, r2, r3 ; r1 = dest + stride
+ lsl r3, #1 ; 2x stride
+
+ vld1.32 {d14[0]}, [r2], r3
+ vld1.32 {d14[1]}, [r1], r3
+ vld1.32 {d15[0]}, [r2]
+ vld1.32 {d15[1]}, [r1]
+
+ adr r12, cospi8sqrt2minus1 ; pointer to the first constant
+
+ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
+ vmul.i16 q2, q4, q6
+
+;|short_idct4x4llm_neon| PROC
+ vld1.16 {d0}, [r12]
+ vswp d3, d4 ;q2(vp[4] vp[12])
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vqadd.s16 q3, q3, q2
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+; memset(input, 0, 32) -- 32 bytes
+ vmov.i16 q14, #0
+
+ vswp d3, d4
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vmov q15, q14
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+ vqadd.s16 q3, q3, q2
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vst1.16 {q14, q15}, [r0]
+
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vaddw.u8 q1, q1, d14
+ vaddw.u8 q2, q2, d15
+
+ sub r2, r2, r3
+ sub r1, r1, r3
+
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+
+ vst1.32 {d0[0]}, [r2], r3
+ vst1.32 {d0[1]}, [r1], r3
+ vst1.32 {d1[0]}, [r2]
+ vst1.32 {d1[1]}, [r1]
+
+ bx lr
+
+ ENDP ; |vp8_dequant_idct_add_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b4e7b
+sinpi8sqrt2 DCD 0x8a8c8a8c
+
+ END
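
The constant pool above duplicates two 16-bit fixed-point factors in each word: 0x4e7b = 20091, roughly (cos(pi/8) * sqrt(2) - 1) << 16, and 0x8a8c = 35468, roughly sin(pi/8) * sqrt(2) << 16. These are the same constants the scalar IDCT (vp8_short_idct4x4llm_c) uses; the doubling vqdmulh, the vshr #1 and the vqadd corrections together reproduce, up to rounding, the scalar products:

    static const int cospi8sqrt2minus1 = 20091;   /* 0x4e7b */
    static const int sinpi8sqrt2      = 35468;    /* 0x8a8c */

    /* Scalar form of the butterfly multiplies, as in the C IDCT: */
    static int mul_c1(int x) { return (x * cospi8sqrt2minus1) >> 16; }
    static int mul_c2(int x) { return (x * sinpi8sqrt2) >> 16; }
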
diff --git a/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm b/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
new file mode 100644
index 0000000..c8e0c31
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
@@ -0,0 +1,34 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_dequantize_b_loop_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 short *Q,
+; r1 short *DQC
+; r2 short *DQ
+|vp8_dequantize_b_loop_neon| PROC
+ vld1.16 {q0, q1}, [r0]
+ vld1.16 {q2, q3}, [r1]
+
+ vmul.i16 q4, q0, q2
+ vmul.i16 q5, q1, q3
+
+ vst1.16 {q4, q5}, [r2]
+
+ bx lr
+
+ ENDP
+
+ END
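
The scalar equivalent is a single multiply loop over the 16 coefficients of a
block (a minimal sketch; the argument order follows the register comments
above, and the name is illustrative):

    void dequantize_b_loop_sketch(const short *Q, const short *DQC, short *DQ)
    {
        int i;

        for (i = 0; i < 16; i++)
            DQ[i] = (short)(Q[i] * DQC[i]);
    }
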
diff --git a/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/libvpx/vp8/common/arm/neon/idct_blk_neon.c
new file mode 100644
index 0000000..ee7f223
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_blk_neon.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_full_2x_neon(short *q, short *dq,
+ unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+ unsigned char *dst, int stride);
+
+
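+/* eobs carries one end-of-block count per 4x4 block, so reading it as a
+ * short tests two adjacent blocks at once: zero means both blocks are
+ * empty, and masking with 0xfefe clears bit 0 of each byte, so a nonzero
+ * result means at least one block has AC coefficients (eob > 1) and needs
+ * the full IDCT; otherwise the cheap DC-only path is enough.
+ */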
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dst, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+ else
+ idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+ }
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+ dstu += 4*stride;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
+
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+
+ q += 32;
+ dstv += 4*stride;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
+}
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
new file mode 100644
index 0000000..6c29c55
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -0,0 +1,79 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_0_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq,
+; unsigned char *dst, int stride);
+; r0 *q
+; r1 dq
+; r2 *dst
+; r3 stride
+|idct_dequant_0_2x_neon| PROC
+ push {r4, r5}
+
+ add r12, r2, #4
+ vld1.32 {d2[0]}, [r2], r3
+ vld1.32 {d8[0]}, [r12], r3
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d10[1]}, [r12], r3
+
+ ldrh r12, [r0] ; lo q
+ ldrh r4, [r0, #32] ; hi q
+ mov r5, #0
+ strh r5, [r0]
+ strh r5, [r0, #32]
+
+ sxth r12, r12 ; lo
+ mul r0, r12, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q0, r0
+ sxth r4, r4 ; hi
+ mul r0, r4, r1
+ add r0, r0, #4
+ asr r0, r0, #3
+ vdup.16 q3, r0
+
+ vaddw.u8 q1, q0, d2 ; lo
+ vaddw.u8 q2, q0, d4
+ vaddw.u8 q4, q3, d8 ; hi
+ vaddw.u8 q5, q3, d10
+
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r0, r2, #4
+
+ vqmovun.s16 d2, q1 ; lo
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d8, q4 ; hi
+ vqmovun.s16 d10, q5
+
+ vst1.32 {d2[0]}, [r2], r3 ; lo
+ vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d2[1]}, [r2], r3
+ vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d4[0]}, [r2], r3
+ vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d4[1]}, [r2]
+ vst1.32 {d10[1]}, [r0]
+
+ pop {r4, r5}
+ bx lr
+
+ ENDP ; |idct_dequant_0_2x_neon|
+ END
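
In scalar terms, the DC-only path computes a single rounded DC value per block
and adds it to all 16 predictor pixels; the routine above does two adjacent
4x4 blocks per call (the lo/hi halves). A one-block sketch with illustrative
names:

    static unsigned char clamp255(int t)
    {
        return t < 0 ? 0 : (t > 255 ? 255 : (unsigned char)t);
    }

    void idct_dequant_0_sketch(short *q, short dq,
                               unsigned char *dst, int stride)
    {
        int r, c;
        int a = (q[0] * dq + 4) >> 3;   /* dequantized, rounded DC term */

        q[0] = 0;                       /* asm zeroes the coefficient it used */
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
                dst[c] = clamp255(dst[c] + a);
            dst += stride;
        }
    }
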
diff --git a/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
new file mode 100644
index 0000000..d5dce63
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -0,0 +1,196 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |idct_dequant_full_2x_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq,
+; unsigned char *dst, int stride);
+; r0 *q,
+; r1 *dq,
+; r2 *dst
+; r3 stride
+|idct_dequant_full_2x_neon| PROC
+ vld1.16 {q0, q1}, [r1] ; dq (same l/r)
+ vld1.16 {q2, q3}, [r0] ; l q
+ add r0, r0, #32
+ vld1.16 {q4, q5}, [r0] ; r q
+ add r12, r2, #4
+
+ ; interleave the predictors
+ vld1.32 {d28[0]}, [r2], r3 ; l pre
+ vld1.32 {d28[1]}, [r12], r3 ; r pre
+ vld1.32 {d29[0]}, [r2], r3
+ vld1.32 {d29[1]}, [r12], r3
+ vld1.32 {d30[0]}, [r2], r3
+ vld1.32 {d30[1]}, [r12], r3
+ vld1.32 {d31[0]}, [r2], r3
+ vld1.32 {d31[1]}, [r12]
+
+ adr r1, cospi8sqrt2minus1 ; pointer to the first constant
+
+ ; dequant: q[i] = q[i] * dq[i]
+ vmul.i16 q2, q2, q0
+ vmul.i16 q3, q3, q1
+ vmul.i16 q4, q4, q0
+ vmul.i16 q5, q5, q1
+
+ vld1.16 {d0}, [r1]
+
+ ; q2: l0r0 q3: l8r8
+ ; q4: l4r4 q5: l12r12
+ vswp d5, d8
+ vswp d7, d10
+
+ ; _CONSTANTS_ * 4,12 >> 16
+ ; q6: 4 * sinpi : c1/temp1
+ ; q7: 12 * sinpi : d1/temp2
+ ; q8: 4 * cospi
+ ; q9: 12 * cospi
+ vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q7, q5, d0[2]
+ vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q9, q5, d0[0]
+
+ vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
+ vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
+
+ ; vqdmulh only accepts signed values. this was a problem because
+ ; our constant had the high bit set, and was treated as a negative value.
+ ; vqdmulh also doubles the value before it shifts by 16. we need to
+ ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+ ; so we can shift the constant without losing precision. this avoids
+    ; shifting again afterward, and also avoids the sign issue. win win!
+ ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+ ; pre-shift it
+ vshr.s16 q8, q8, #1
+ vshr.s16 q9, q9, #1
+
+ ; q4: 4 + 4 * cospi : d1/temp1
+ ; q5: 12 + 12 * cospi : c1/temp2
+ vqadd.s16 q4, q4, q8
+ vqadd.s16 q5, q5, q9
+
+ ; c1 = temp1 - temp2
+ ; d1 = temp1 + temp2
+ vqsub.s16 q2, q6, q5
+ vqadd.s16 q3, q4, q7
+
+ ; [0]: a1+d1
+ ; [1]: b1+c1
+ ; [2]: b1-c1
+ ; [3]: a1-d1
+ vqadd.s16 q4, q10, q3
+ vqadd.s16 q5, q11, q2
+ vqsub.s16 q6, q11, q2
+ vqsub.s16 q7, q10, q3
+
+ ; rotate
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+ ; idct loop 2
+ ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+ ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+ ; q6: l 2, 6,10,14 r 2, 6,10,14
+ ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+ ; q8: 1 * sinpi : c1/temp1
+ ; q9: 3 * sinpi : d1/temp2
+ ; q10: 1 * cospi
+ ; q11: 3 * cospi
+ vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
+ vqdmulh.s16 q9, q7, d0[2]
+ vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
+ vqdmulh.s16 q11, q7, d0[0]
+
+ vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
+ vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
+
+ ; see note on shifting above
+ vshr.s16 q10, q10, #1
+ vshr.s16 q11, q11, #1
+
+ ; q10: 1 + 1 * cospi : d1/temp1
+ ; q11: 3 + 3 * cospi : c1/temp2
+ vqadd.s16 q10, q5, q10
+ vqadd.s16 q11, q7, q11
+
+ ; q8: c1 = temp1 - temp2
+ ; q9: d1 = temp1 + temp2
+ vqsub.s16 q8, q8, q11
+ vqadd.s16 q9, q10, q9
+
+ ; a1+d1
+ ; b1+c1
+ ; b1-c1
+ ; a1-d1
+ vqadd.s16 q4, q2, q9
+ vqadd.s16 q5, q3, q8
+ vqsub.s16 q6, q3, q8
+ vqsub.s16 q7, q2, q9
+
+ ; +4 >> 3 (rounding)
+ vrshr.s16 q4, q4, #3 ; lo
+ vrshr.s16 q5, q5, #3
+ vrshr.s16 q6, q6, #3 ; hi
+ vrshr.s16 q7, q7, #3
+
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; adding pre
+ ; input is still packed. pre was read interleaved
+ vaddw.u8 q4, q4, d28
+ vaddw.u8 q5, q5, d29
+ vaddw.u8 q6, q6, d30
+ vaddw.u8 q7, q7, d31
+
+ vmov.i16 q14, #0
+ vmov q15, q14
+ vst1.16 {q14, q15}, [r0] ; write over high input
+ sub r0, r0, #32
+ vst1.16 {q14, q15}, [r0] ; write over low input
+
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r1, r2, #4 ; hi
+
+ ;saturate and narrow
+ vqmovun.s16 d0, q4 ; lo
+ vqmovun.s16 d1, q5
+ vqmovun.s16 d2, q6 ; hi
+ vqmovun.s16 d3, q7
+
+ vst1.32 {d0[0]}, [r2], r3 ; lo
+ vst1.32 {d0[1]}, [r1], r3 ; hi
+ vst1.32 {d1[0]}, [r2], r3
+ vst1.32 {d1[1]}, [r1], r3
+ vst1.32 {d2[0]}, [r2], r3
+ vst1.32 {d2[1]}, [r1], r3
+ vst1.32 {d3[0]}, [r2]
+ vst1.32 {d3[1]}, [r1]
+
+ bx lr
+
+ ENDP ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2 DCD 0x4546
+
+ END
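
The shifting note above is the key fixed-point trick in this file: vqdmulh.s16
computes sat((2*a*b) >> 16), while the IDCT wants (x*K) >> 16 for both
constants. A scalar model of the two cases (a sketch matching the constant
pool above; for the 13-bit IDCT inputs the saturation never triggers):

    #include <stdint.h>

    /* model of vqdmulh.s16: saturate((2 * a * b) >> 16) */
    static int16_t vqdmulh_s16(int16_t a, int16_t b)
    {
        int32_t p = ((int32_t)a * b) >> 15;   /* doubling then >>16 == >>15 */
        if (p > 32767)  p = 32767;
        if (p < -32768) p = -32768;
        return (int16_t)p;
    }

    /* sinpi8sqrt2 = 35468 = 0x8a8c: the low bit is 0, so halving it to
     * 0x4546 keeps the constant positive as s16 and cancels the doubling */
    int16_t mul_sinpi(int16_t x) { return vqdmulh_s16(x, 0x4546); }

    /* cospi8sqrt2minus1 = 20091 = 0x4e7b: the low bit is 1, so pre-shifting
     * would lose precision; shift the doubled product back down instead */
    int16_t mul_cospi(int16_t x) { return (int16_t)(vqdmulh_s16(x, 0x4e7b) >> 1); }
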
diff --git a/libvpx/vp8/common/arm/neon/iwalsh_neon.asm b/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
new file mode 100644
index 0000000..e8ea2a6
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/iwalsh_neon.asm
@@ -0,0 +1,87 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+ EXPORT |vp8_short_inv_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
+|vp8_short_inv_walsh4x4_neon| PROC
+
+ ; read in all four lines of values: d0->d3
+ vld1.i16 {q0-q1}, [r0@128]
+
+ ; first for loop
+ vadd.s16 d4, d0, d3 ;a = [0] + [12]
+ vadd.s16 d6, d1, d2 ;b = [4] + [8]
+ vsub.s16 d5, d0, d3 ;d = [0] - [12]
+ vsub.s16 d7, d1, d2 ;c = [4] - [8]
+
+ vadd.s16 q0, q2, q3 ; a+b d+c
+ vsub.s16 q1, q2, q3 ; a-b d-c
+
+ vtrn.32 d0, d2 ;d0: 0 1 8 9
+ ;d2: 2 3 10 11
+ vtrn.32 d1, d3 ;d1: 4 5 12 13
+ ;d3: 6 7 14 15
+
+ vtrn.16 d0, d1 ;d0: 0 4 8 12
+ ;d1: 1 5 9 13
+ vtrn.16 d2, d3 ;d2: 2 6 10 14
+ ;d3: 3 7 11 15
+
+ ; second for loop
+
+ vadd.s16 d4, d0, d3 ;a = [0] + [3]
+ vadd.s16 d6, d1, d2 ;b = [1] + [2]
+ vsub.s16 d5, d0, d3 ;d = [0] - [3]
+ vsub.s16 d7, d1, d2 ;c = [1] - [2]
+
+ vmov.i16 q8, #3
+
+ vadd.s16 q0, q2, q3 ; a+b d+c
+ vsub.s16 q1, q2, q3 ; a-b d-c
+
+ vadd.i16 q0, q0, q8 ;e/f += 3
+ vadd.i16 q1, q1, q8 ;g/h += 3
+
+ vshr.s16 q0, q0, #3 ;e/f >> 3
+ vshr.s16 q1, q1, #3 ;g/h >> 3
+
+ mov r2, #64
+ add r3, r1, #32
+
+ vst1.i16 d0[0], [r1],r2
+ vst1.i16 d1[0], [r3],r2
+ vst1.i16 d2[0], [r1],r2
+ vst1.i16 d3[0], [r3],r2
+
+ vst1.i16 d0[1], [r1],r2
+ vst1.i16 d1[1], [r3],r2
+ vst1.i16 d2[1], [r1],r2
+ vst1.i16 d3[1], [r3],r2
+
+ vst1.i16 d0[2], [r1],r2
+ vst1.i16 d1[2], [r3],r2
+ vst1.i16 d2[2], [r1],r2
+ vst1.i16 d3[2], [r3],r2
+
+ vst1.i16 d0[3], [r1],r2
+ vst1.i16 d1[3], [r3],r2
+ vst1.i16 d2[3], [r1]
+ vst1.i16 d3[3], [r3]
+
+ bx lr
+ ENDP ; |vp8_short_inv_walsh4x4_neon|
+
+ END
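
A scalar rendering of the same transform, following the per-line comments
above: one pass over columns, one over rows with +3 >> 3 rounding, then one DC
written per 16-coefficient block of mb_dqcoeff (the strided stores at the
end). A sketch, not the project's C version:

    void inv_walsh4x4_sketch(const short *input, short *mb_dqcoeff)
    {
        short out[16];
        int i, a1, b1, c1, d1;

        for (i = 0; i < 4; i++)            /* first pass (columns) */
        {
            a1 = input[i] + input[i + 12];
            b1 = input[i + 4] + input[i + 8];
            d1 = input[i] - input[i + 12];
            c1 = input[i + 4] - input[i + 8];
            out[i]      = (short)(a1 + b1);
            out[i + 4]  = (short)(d1 + c1);
            out[i + 8]  = (short)(a1 - b1);
            out[i + 12] = (short)(d1 - c1);
        }
        for (i = 0; i < 4; i++)            /* second pass (rows) */
        {
            const short *ip = out + 4 * i;
            a1 = ip[0] + ip[3];
            b1 = ip[1] + ip[2];
            d1 = ip[0] - ip[3];
            c1 = ip[1] - ip[2];
            /* one DC per 16-coefficient block, hence the *16 stride */
            mb_dqcoeff[(4 * i + 0) * 16] = (short)((a1 + b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 1) * 16] = (short)((d1 + c1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 2) * 16] = (short)((a1 - b1 + 3) >> 3);
            mb_dqcoeff[(4 * i + 3) * 16] = (short)((d1 - c1 + 3) >> 3);
        }
    }
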
diff --git a/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
new file mode 100644
index 0000000..e44be0a
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfilter_neon.asm
@@ -0,0 +1,397 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_loop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_loop_filter_horizontal_edge_y_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1
+ add r1, r1, r1
+
+ vdup.u8 q2, r3 ; duplicate thresh
+
+ vld1.u8 {q3}, [r2@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r2@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r2@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r2@128] ; q2
+ vld1.u8 {q10}, [r12@128] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r12, r12, r1, lsl #1
+
+ bl vp8_loop_filter_neon
+
+ vst1.u8 {q5}, [r2@128], r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r2@128], r1 ; store oq0
+ vst1.u8 {q8}, [r12@128], r1 ; store oq1
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
+
+
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_loop_filter_horizontal_edge_uv_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ ldr r12, [sp, #4] ; load thresh
+ ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q2, r12 ; duplicate thresh
+
+ sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r3@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r3@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r3@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r3@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r3@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r3@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r3@64] ; q3
+ vld1.u8 {d21}, [r12@64] ; q3
+
+ bl vp8_loop_filter_neon
+
+ sub r0, r0, r1, lsl #1
+ sub r2, r2, r1, lsl #1
+
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r2@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r2@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r2@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64] ; store u oq1
+ vst1.u8 {d17}, [r2@64] ; store v oq1
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
+
+; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+;                                           const unsigned char *blimit,
+;                                           const unsigned char *limit,
+;                                           const unsigned char *thresh)
+; r0 unsigned char *src
+; r1 int pitch
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+
+|vp8_loop_filter_vertical_edge_y_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r2, r0, #4 ; src ptr down by 4 columns
+ add r1, r1, r1
+ ldr r3, [sp, #4] ; load thresh
+ add r12, r2, r1, asr #1
+
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d10}, [r2], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d14}, [r2], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d18}, [r2], r1
+ vld1.u8 {d20}, [r12], r1
+
+ vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d11}, [r2], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d15}, [r2], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d19}, [r2]
+ vld1.u8 {d21}, [r12]
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vdup.u8 q2, r3 ; duplicate thresh
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp8_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+
+ sub r0, r0, #2 ; dst ptr
+
+ vswp d14, d12
+ vswp d16, d15
+
+ add r12, r0, r1, asr #1
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
+
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
+
+; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+;                                            const unsigned char *blimit,
+;                                            const unsigned char *limit,
+;                                            const unsigned char *thresh,
+;                                            unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_loop_filter_vertical_edge_uv_neon| PROC
+ push {lr}
+ vdup.u8 q0, r2 ; duplicate blimit
+ sub r12, r0, #4 ; move u pointer down by 4 columns
+ ldr r2, [sp, #8] ; load v ptr
+ vdup.u8 q1, r3 ; duplicate limit
+ sub r3, r2, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r12], r1 ;load u data
+ vld1.u8 {d7}, [r3], r1 ;load v data
+ vld1.u8 {d8}, [r12], r1
+ vld1.u8 {d9}, [r3], r1
+ vld1.u8 {d10}, [r12], r1
+ vld1.u8 {d11}, [r3], r1
+ vld1.u8 {d12}, [r12], r1
+ vld1.u8 {d13}, [r3], r1
+ vld1.u8 {d14}, [r12], r1
+ vld1.u8 {d15}, [r3], r1
+ vld1.u8 {d16}, [r12], r1
+ vld1.u8 {d17}, [r3], r1
+ vld1.u8 {d18}, [r12], r1
+ vld1.u8 {d19}, [r3], r1
+ vld1.u8 {d20}, [r12]
+ vld1.u8 {d21}, [r3]
+
+ ldr r12, [sp, #4] ; load thresh
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vdup.u8 q2, r12 ; duplicate thresh
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ bl vp8_loop_filter_neon
+
+ vswp d12, d11
+ vswp d16, d13
+ vswp d14, d12
+ vswp d16, d15
+
+ sub r0, r0, #2
+ sub r2, r2, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
+ vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
+ vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
+ vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
+ vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
+ vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
+ vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
+ vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
+ vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
+ vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
+
+ pop {pc}
+ ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
+
+; void vp8_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+
+; r0-r3 PRESERVE
+; q0 flimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+|vp8_loop_filter_neon| PROC
+
+ ; vp8_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q3, q3, q4
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3
+
+ vmov.u8 q10, #0x80 ; 0x80
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ vcge.u8 q15, q1, q15
+
+ ; vp8_filter() function
+ ; convert to signed
+ veor q7, q7, q10 ; qs0
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u8 q10, #3 ; #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
+
+ vmovl.u8 q4, d20
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; vp8_filter &= hev
+ vand q15, q15, q9 ; vp8_filter_mask
+
+ vaddw.s8 q2, q2, d2
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4 ; #4
+
+ ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
+ vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; Filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
+
+ ; outer tap adjustments: ++vp8_filter >> 1
+ vrshr.s8 q1, q1, #1
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+ vmov.u8 q0, #0x80 ; 0x80
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ veor q5, q13, q0 ; *op1 = u^0x80
+ veor q8, q12, q0 ; *oq1 = u^0x80
+
+ bx lr
+    ENDP        ; |vp8_loop_filter_neon|
+
+;-----------------
+
+ END
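
The helper's comments map directly onto the scalar filter: build a flatness
mask and a high-edge-variance (hev) mask, then nudge the two pixels on each
side of the edge. A per-pixel sketch of the adjustment stage; as in the
registers above, the values are in the signed domain (pixel ^ 0x80, the veor
steps) and mask/hev are 0 or -1:

    typedef signed char sc;

    static sc clamp_s8(int t)
    {
        return t < -128 ? (sc)-128 : (t > 127 ? (sc)127 : (sc)t);
    }

    static void filter_core_sketch(sc mask, sc hev,
                                   sc *op1, sc *op0, sc *oq0, sc *oq1)
    {
        sc ps1 = *op1, ps0 = *op0, qs0 = *oq0, qs1 = *oq1;
        sc f, f1, f2;

        f = clamp_s8(ps1 - qs1) & hev;            /* outer taps only when hev */
        f = clamp_s8(f + 3 * (qs0 - ps0)) & mask;

        f2 = clamp_s8(f + 3) >> 3;                /* Filter2 */
        f1 = clamp_s8(f + 4) >> 3;                /* Filter1 */
        *oq0 = clamp_s8(qs0 - f1);
        *op0 = clamp_s8(ps0 + f2);

        f = (sc)((f1 + 1) >> 1) & (sc)~hev;       /* outer tap adjustment */
        *oq1 = clamp_s8(qs1 - f);
        *op1 = clamp_s8(ps1 + f);
    }

The conversion back to unsigned (another XOR with 0x80) happens on the way
out, as in the final veor block of the helper.
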
diff --git a/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
new file mode 100644
index 0000000..adf848b
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -0,0 +1,117 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
+ EXPORT |vp8_loop_filter_bhs_neon|
+ EXPORT |vp8_loop_filter_mbhs_neon|
+ ARM
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
+
+|vp8_loop_filter_simple_horizontal_edge_neon| PROC
+
+ sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
+
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q5}, [r3@128], r1 ; p0
+ vld1.u8 {q8}, [r0@128] ; q1
+ vld1.u8 {q6}, [r3@128] ; p1
+
+ vabd.u8 q15, q6, q7 ; abs(p0 - q0)
+ vabd.u8 q14, q5, q8 ; abs(p1 - q1)
+
+ vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
+ vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q13, #3
+ vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
+ veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
+ veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
+
+ vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q3, d15, d13
+
+ vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
+ vmul.s16 q3, q3, q13
+
+ vmov.u8 q10, #0x03 ; 0x03
+ vmov.u8 q9, #0x04 ; 0x04
+
+ vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q3, q3, d9
+
+ vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d9, q3
+
+ vand q14, q4, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q4, q3, #3 ; Filter1 >>= 3
+
+ sub r0, r0, r1
+
+ ;calculate output
+ vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+
+ vst1.u8 {q6}, [r3@128] ; store op0
+ vst1.u8 {q7}, [r0@128] ; store oq0
+
+ bx lr
+ ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_bhs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate blim
+
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
+ bl vp8_loop_filter_simple_horizontal_edge_neon
+ ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
+    add     r0, r0, r1, lsl #2          ; src = y_ptr + 8 * y_stride
+ bl vp8_loop_filter_simple_horizontal_edge_neon
+ add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
+ pop {r4, lr}
+ b vp8_loop_filter_simple_horizontal_edge_neon
+ ENDP ;|vp8_loop_filter_bhs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_mbhs_neon| PROC
+ ldrb r3, [r2] ; load blim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp8_loop_filter_simple_horizontal_edge_neon
+    ENDP        ;|vp8_loop_filter_mbhs_neon|
+
+ END
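
The simple filter touches only p0 and q0 and uses the single-threshold mask
computed above. A scalar sketch of one pixel position (illustrative names;
the mask test is done on the unsigned pixels, the adjustment in the signed
domain):

    #include <stdlib.h>

    typedef signed char sc;

    static sc clamp_s8(int t)
    {
        return t < -128 ? (sc)-128 : (t > 127 ? (sc)127 : (sc)t);
    }

    static void simple_filter_sketch(unsigned char blimit,
                                     unsigned char *op1, unsigned char *op0,
                                     unsigned char *oq0, unsigned char *oq1)
    {
        int mask = (abs(*op0 - *oq0) * 2 + abs(*op1 - *oq1) / 2 <= blimit) * -1;
        sc ps1 = (sc)(*op1 ^ 0x80), ps0 = (sc)(*op0 ^ 0x80);
        sc qs0 = (sc)(*oq0 ^ 0x80), qs1 = (sc)(*oq1 ^ 0x80);
        sc f, f1, f2;

        f = clamp_s8(ps1 - qs1);
        f = clamp_s8(f + 3 * (qs0 - ps0)) & (sc)mask;

        f1 = clamp_s8(f + 4) >> 3;
        f2 = clamp_s8(f + 3) >> 3;

        *oq0 = (unsigned char)(clamp_s8(qs0 - f1) ^ 0x80);
        *op0 = (unsigned char)(clamp_s8(ps0 + f2) ^ 0x80);
    }
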
diff --git a/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
new file mode 100644
index 0000000..e690df2
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -0,0 +1,154 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
+ EXPORT |vp8_loop_filter_bvs_neon|
+ EXPORT |vp8_loop_filter_mbvs_neon|
+ ARM
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *s, PRESERVE
+; r1 int p, PRESERVE
+; q1 limit, PRESERVE
+
+|vp8_loop_filter_simple_vertical_edge_neon| PROC
+ sub r0, r0, #2 ; move src pointer down by 2 columns
+ add r12, r1, r1
+ add r3, r0, r1
+
+ vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
+ vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
+ vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
+ vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
+ vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
+ vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
+ vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
+ vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
+
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
+ vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
+
+ vswp d7, d10
+ vswp d12, d9
+
+ ;vp8_filter_mask() function
+ ;vp8_hevmask() function
+ sub r0, r0, r1, lsl #4
+ vabd.u8 q15, q5, q4 ; abs(p0 - q0)
+ vabd.u8 q14, q3, q6 ; abs(p1 - q1)
+
+ vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
+ vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
+ vmov.u8 q0, #0x80 ; 0x80
+ vmov.s16 q11, #3
+ vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
+ veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
+ veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
+ veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
+
+    vcge.u8 q15, q1, q15                ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
+
+ vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
+ vsubl.s8 q13, d9, d11
+
+ vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
+
+ vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+ vmul.s16 q13, q13, q11
+
+ vmov.u8 q11, #0x03 ; 0x03
+ vmov.u8 q12, #0x04 ; 0x04
+
+ vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d29
+
+ vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d29, q13
+
+ add r0, r0, #1
+ add r3, r0, r1
+
+ vand q14, q14, q15 ; vp8_filter &= mask
+
+ vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+ vshr.s8 q2, q2, #3 ; Filter2 >>= 3
+ vshr.s8 q14, q3, #3 ; Filter1 >>= 3
+
+ ;calculate output
+ vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
+ vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
+
+ veor q6, q11, q0 ; *op0 = u^0x80
+ veor q7, q10, q0 ; *oq0 = u^0x80
+ add r12, r1, r1
+ vswp d13, d14
+
+ ;store op1, op0, oq0, oq1
+ vst2.8 {d12[0], d13[0]}, [r0], r12
+ vst2.8 {d12[1], d13[1]}, [r3], r12
+ vst2.8 {d12[2], d13[2]}, [r0], r12
+ vst2.8 {d12[3], d13[3]}, [r3], r12
+ vst2.8 {d12[4], d13[4]}, [r0], r12
+ vst2.8 {d12[5], d13[5]}, [r3], r12
+ vst2.8 {d12[6], d13[6]}, [r0], r12
+ vst2.8 {d12[7], d13[7]}, [r3], r12
+ vst2.8 {d14[0], d15[0]}, [r0], r12
+ vst2.8 {d14[1], d15[1]}, [r3], r12
+ vst2.8 {d14[2], d15[2]}, [r0], r12
+ vst2.8 {d14[3], d15[3]}, [r3], r12
+ vst2.8 {d14[4], d15[4]}, [r0], r12
+ vst2.8 {d14[5], d15[5]}, [r3], r12
+ vst2.8 {d14[6], d15[6]}, [r0], r12
+ vst2.8 {d14[7], d15[7]}, [r3]
+
+ bx lr
+ ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_bvs_neon| PROC
+ push {r4, lr}
+ ldrb r3, [r2] ; load blim from mem
+ mov r4, r0
+ add r0, r0, #4
+ vdup.s8 q1, r3 ; duplicate blim
+ bl vp8_loop_filter_simple_vertical_edge_neon
+ ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
+ add r0, r4, #8
+ bl vp8_loop_filter_simple_vertical_edge_neon
+ add r0, r4, #12
+ pop {r4, lr}
+ b vp8_loop_filter_simple_vertical_edge_neon
+ ENDP ;|vp8_loop_filter_bvs_neon|
+
+; r0 unsigned char *y
+; r1 int ystride
+; r2 const unsigned char *blimit
+
+|vp8_loop_filter_mbvs_neon| PROC
+ ldrb r3, [r2] ; load mblim from mem
+ vdup.s8 q1, r3 ; duplicate mblim
+ b vp8_loop_filter_simple_vertical_edge_neon
+    ENDP        ;|vp8_loop_filter_mbvs_neon|
+ END
diff --git a/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm b/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
new file mode 100644
index 0000000..f41c156
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -0,0 +1,469 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
+ EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
+ push {lr}
+ add r1, r1, r1 ; double stride
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
+
+ vld1.u8 {q3}, [r0@128], r1 ; p3
+ vld1.u8 {q4}, [r12@128], r1 ; p2
+ vld1.u8 {q5}, [r0@128], r1 ; p1
+ vld1.u8 {q6}, [r12@128], r1 ; p0
+ vld1.u8 {q7}, [r0@128], r1 ; q0
+ vld1.u8 {q8}, [r12@128], r1 ; q1
+ vld1.u8 {q9}, [r0@128], r1 ; q2
+ vld1.u8 {q10}, [r12@128], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #2
+ add r0, r12, r1, lsr #1
+
+ vst1.u8 {q4}, [r12@128],r1 ; store op2
+ vst1.u8 {q5}, [r0@128],r1 ; store op1
+ vst1.u8 {q6}, [r12@128], r1 ; store op0
+ vst1.u8 {q7}, [r0@128],r1 ; store oq0
+ vst1.u8 {q8}, [r12@128] ; store oq1
+ vst1.u8 {q9}, [r0@128] ; store oq2
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
+
+; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+; sp+4 unsigned char *v
+
+|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
+
+ vld1.u8 {d6}, [r0@64], r1 ; p3
+ vld1.u8 {d7}, [r12@64], r1 ; p3
+ vld1.u8 {d8}, [r0@64], r1 ; p2
+ vld1.u8 {d9}, [r12@64], r1 ; p2
+ vld1.u8 {d10}, [r0@64], r1 ; p1
+ vld1.u8 {d11}, [r12@64], r1 ; p1
+ vld1.u8 {d12}, [r0@64], r1 ; p0
+ vld1.u8 {d13}, [r12@64], r1 ; p0
+ vld1.u8 {d14}, [r0@64], r1 ; q0
+ vld1.u8 {d15}, [r12@64], r1 ; q0
+ vld1.u8 {d16}, [r0@64], r1 ; q1
+ vld1.u8 {d17}, [r12@64], r1 ; q1
+ vld1.u8 {d18}, [r0@64], r1 ; q2
+ vld1.u8 {d19}, [r12@64], r1 ; q2
+ vld1.u8 {d20}, [r0@64], r1 ; q3
+ vld1.u8 {d21}, [r12@64], r1 ; q3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r0, r0, r1, lsl #3
+ sub r12, r12, r1, lsl #3
+
+ add r0, r0, r1
+ add r12, r12, r1
+
+ vst1.u8 {d8}, [r0@64], r1 ; store u op2
+ vst1.u8 {d9}, [r12@64], r1 ; store v op2
+ vst1.u8 {d10}, [r0@64], r1 ; store u op1
+ vst1.u8 {d11}, [r12@64], r1 ; store v op1
+ vst1.u8 {d12}, [r0@64], r1 ; store u op0
+ vst1.u8 {d13}, [r12@64], r1 ; store v op0
+ vst1.u8 {d14}, [r0@64], r1 ; store u oq0
+ vst1.u8 {d15}, [r12@64], r1 ; store v oq0
+ vst1.u8 {d16}, [r0@64], r1 ; store u oq1
+ vst1.u8 {d17}, [r12@64], r1 ; store v oq1
+ vst1.u8 {d18}, [r0@64], r1 ; store u oq2
+ vst1.u8 {d19}, [r12@64], r1 ; store v oq2
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
+
+; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh)
+; r0 unsigned char *src,
+; r1 int pitch,
+; r2 unsigned char blimit
+; r3 unsigned char limit
+; sp unsigned char thresh,
+|vp8_mbloop_filter_vertical_edge_y_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move src pointer down by 4 columns
+ vdup.s8 q2, r12 ; thresh
+ add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
+ vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub r0, r0, r1, lsl #3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r12], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
+
+; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
+; const unsigned char *blimit,
+; const unsigned char *limit,
+; const unsigned char *thresh,
+; unsigned char *v)
+; r0 unsigned char *u,
+; r1 int pitch,
+; r2    unsigned char blimit
+; r3    unsigned char limit
+; sp    unsigned char thresh,
+; sp+4 unsigned char *v
+|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
+ push {lr}
+ ldr r12, [sp, #4] ; load thresh
+ sub r0, r0, #4 ; move u pointer down by 4 columns
+ vdup.u8 q2, r12 ; thresh
+ ldr r12, [sp, #8] ; load v ptr
+ sub r12, r12, #4 ; move v pointer down by 4 columns
+
+ vld1.u8 {d6}, [r0], r1 ;load u data
+ vld1.u8 {d7}, [r12], r1 ;load v data
+ vld1.u8 {d8}, [r0], r1
+ vld1.u8 {d9}, [r12], r1
+ vld1.u8 {d10}, [r0], r1
+ vld1.u8 {d11}, [r12], r1
+ vld1.u8 {d12}, [r0], r1
+ vld1.u8 {d13}, [r12], r1
+ vld1.u8 {d14}, [r0], r1
+ vld1.u8 {d15}, [r12], r1
+ vld1.u8 {d16}, [r0], r1
+ vld1.u8 {d17}, [r12], r1
+ vld1.u8 {d18}, [r0], r1
+ vld1.u8 {d19}, [r12], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r12], r1
+
+ ;transpose to 8x16 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ sub r0, r0, r1, lsl #3
+
+ bl vp8_mbloop_filter_neon
+
+ sub r12, r12, r1, lsl #3
+
+ ;transpose to 16x8 matrix
+ vtrn.32 q3, q7
+ vtrn.32 q4, q8
+ vtrn.32 q5, q9
+ vtrn.32 q6, q10
+
+ vtrn.16 q3, q5
+ vtrn.16 q4, q6
+ vtrn.16 q7, q9
+ vtrn.16 q8, q10
+
+ vtrn.8 q3, q4
+ vtrn.8 q5, q6
+ vtrn.8 q7, q8
+ vtrn.8 q9, q10
+
+ ;store op2, op1, op0, oq0, oq1, oq2
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d7}, [r12], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d9}, [r12], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d11}, [r12], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d13}, [r12], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d15}, [r12], r1
+ vst1.8 {d16}, [r0], r1
+ vst1.8 {d17}, [r12], r1
+ vst1.8 {d18}, [r0], r1
+ vst1.8 {d19}, [r12], r1
+ vst1.8 {d20}, [r0]
+ vst1.8 {d21}, [r12]
+
+ pop {pc}
+ ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
+
+; void vp8_mbloop_filter_neon()
+; This is a helper function for the macroblock loopfilters. The individual
+; functions do the necessary load, transpose (if necessary), preserve (if
+; necessary) and store.
+
+; r0,r1 PRESERVE
+; r2 mblimit
+; r3 limit
+
+; q2 thresh
+; q3 p3 PRESERVE
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3 PRESERVE
+
+|vp8_mbloop_filter_neon| PROC
+
+ ; vp8_filter_mask
+ vabd.u8 q11, q3, q4 ; abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; abs(q1 - q0)
+ vabd.u8 q1, q9, q8 ; abs(q2 - q1)
+ vabd.u8 q0, q10, q9 ; abs(q3 - q2)
+
+ vmax.u8 q11, q11, q12
+ vmax.u8 q12, q13, q14
+ vmax.u8 q1, q1, q0
+ vmax.u8 q15, q11, q12
+
+ vabd.u8 q12, q6, q7 ; abs(p0 - q0)
+
+ ; vp8_hevmask
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
+ vmax.u8 q15, q15, q1
+
+ vdup.u8 q1, r3 ; limit
+ vdup.u8 q2, r2 ; mblimit
+
+ vmov.u8 q0, #0x80 ; 0x80
+
+ vcge.u8 q15, q1, q15
+
+ vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
+ vmov.u16 q11, #3 ; #3
+
+ ; vp8_filter
+ ; convert to signed
+ veor q7, q7, q0 ; qs0
+ vshr.u8 q1, q1, #1 ; a = a / 2
+ veor q6, q6, q0 ; ps0
+ veor q5, q5, q0 ; ps1
+
+ vqadd.u8 q12, q12, q1 ; a = b + a
+
+ veor q8, q8, q0 ; qs1
+ veor q4, q4, q0 ; ps2
+ veor q9, q9, q0 ; qs2
+
+ vorr q14, q13, q14 ; vp8_hevmask
+
+ vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
+
+ vsubl.s8 q2, d14, d12 ; qs0 - ps0
+ vsubl.s8 q13, d15, d13
+
+ vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
+
+ vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
+
+ vand q15, q15, q12 ; vp8_filter_mask
+
+ vmul.i16 q13, q13, q11
+
+ vmov.u8 q12, #3 ; #3
+
+ vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
+ vaddw.s8 q13, q13, d3
+
+ vmov.u8 q11, #4 ; #4
+
+ ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q13
+
+ vand q1, q1, q15 ; vp8_filter &= mask
+
+ vmov.u16 q15, #63 ; #63
+
+ vand q13, q1, q14 ; Filter2 &= hev
+
+ vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
+ vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
+
+ vmov q0, q15
+
+ vshr.s8 q2, q2, #3 ; Filter1 >>= 3
+ vshr.s8 q13, q13, #3 ; Filter2 >>= 3
+
+ vmov q11, q15
+ vmov q12, q15
+
+ vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
+
+ vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
+
+ vbic q1, q1, q14 ; vp8_filter &= ~hev
+
+ ; roughly 1/7th difference across boundary
+ ; roughly 2/7th difference across boundary
+ ; roughly 3/7th difference across boundary
+
+ vmov.u8 d5, #9 ; #9
+ vmov.u8 d4, #18 ; #18
+
+ vmov q13, q15
+ vmov q14, q15
+
+ vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
+ vmlal.s8 q11, d3, d5
+ vmov.u8 d5, #27 ; #27
+ vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
+ vmlal.s8 q13, d3, d4
+ vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
+ vmlal.s8 q15, d3, d5
+
+ vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
+ vqshrn.s16 d1, q11, #7
+ vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
+ vqshrn.s16 d25, q13, #7
+ vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
+ vqshrn.s16 d29, q15, #7
+
+ vmov.u8 q1, #0x80 ; 0x80
+
+ vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
+ vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
+ vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
+ vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
+ vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
+ vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
+
+ veor q9, q11, q1 ; *oq2 = s^0x80
+ veor q4, q0, q1 ; *op2 = s^0x80
+ veor q8, q13, q1 ; *oq1 = s^0x80
+    veor        q5, q12, q1                ; *op1 = s^0x80
+ veor q7, q15, q1 ; *oq0 = s^0x80
+ veor q6, q14, q1 ; *op0 = s^0x80
+
+ bx lr
+ ENDP ; |vp8_mbloop_filter_neon|
+
+;-----------------
+
+ END
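
Where the normal filter stops at p1/q1, the macroblock filter also adjusts
p2/q2 whenever the edge is not high-variance, using the three fractional taps
noted in the comments ("roughly 1/7th, 2/7th, 3/7th difference"). A sketch of
that stage, in the same signed domain as the helper; w is the masked filter
value with the hev positions already cleared (the vbic step):

    typedef signed char sc;

    static sc clamp_s8(int t)
    {
        return t < -128 ? (sc)-128 : (t > 127 ? (sc)127 : (sc)t);
    }

    static void mb_wide_taps_sketch(sc w, sc *op2, sc *op1, sc *op0,
                                    sc *oq0, sc *oq1, sc *oq2)
    {
        sc u;

        u = clamp_s8((63 + w * 27) >> 7);   /* roughly 3/7 of the difference */
        *oq0 = clamp_s8(*oq0 - u);
        *op0 = clamp_s8(*op0 + u);

        u = clamp_s8((63 + w * 18) >> 7);   /* roughly 2/7 */
        *oq1 = clamp_s8(*oq1 - u);
        *op1 = clamp_s8(*op1 + u);

        u = clamp_s8((63 + w * 9) >> 7);    /* roughly 1/7 */
        *oq2 = clamp_s8(*oq2 - u);
        *op2 = clamp_s8(*op2 + u);
    }
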
diff --git a/libvpx/vp8/common/arm/neon/sad16_neon.asm b/libvpx/vp8/common/arm/neon/sad16_neon.asm
new file mode 100644
index 0000000..d7c590e
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sad16_neon.asm
@@ -0,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad16x16_neon|
+ EXPORT |vp8_sad16x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int src_stride
+; r2 unsigned char *ref_ptr
+; r3 int ref_stride
+|vp8_sad16x16_neon| PROC
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+;;
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0]
+ vld1.8 {q7}, [r2]
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+|vp8_sad16x8_neon| PROC
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+ vabdl.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vld1.8 {q0}, [r0], r1
+ vld1.8 {q4}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+ vabal.u8 q13, d1, d9
+
+ vld1.8 {q2}, [r0], r1
+ vld1.8 {q6}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+ vabal.u8 q13, d3, d11
+
+ vld1.8 {q3}, [r0], r1
+ vld1.8 {q7}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q13, d5, d13
+
+ vabal.u8 q12, d6, d14
+ vabal.u8 q13, d7, d15
+
+ vadd.u16 q0, q12, q13
+
+ vpaddl.u16 q1, q0
+ vpaddl.u32 q0, q1
+
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
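
Each SAD routine is the NEON form of the obvious scalar loop, widening
absolute differences into running u16 accumulators (vabdl/vabal) and reducing
them at the end (vpaddl). The scalar reference for the 16x16 case, a sketch
(16x8 differs only in the row count):

    #include <stdlib.h>

    unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
                sad += (unsigned int)abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }
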
diff --git a/libvpx/vp8/common/arm/neon/sad8_neon.asm b/libvpx/vp8/common/arm/neon/sad8_neon.asm
new file mode 100644
index 0000000..23ba6df
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sad8_neon.asm
@@ -0,0 +1,209 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sad8x8_neon|
+ EXPORT |vp8_sad8x16_neon|
+ EXPORT |vp8_sad4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x8_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad8x16_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vabal.u8 q12, d6, d14
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabal.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 q1, q12
+ vpaddl.u32 q0, q1
+ vadd.u32 d0, d0, d1
+
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+
+|vp8_sad4x4_neon| PROC
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d8}, [r2], r3
+
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d10}, [r2], r3
+
+ vabdl.u8 q12, d0, d8
+
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d12}, [r2], r3
+
+ vabal.u8 q12, d2, d10
+
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d14}, [r2], r3
+
+ vabal.u8 q12, d4, d12
+ vabal.u8 q12, d6, d14
+
+ vpaddl.u16 d1, d24
+ vpaddl.u32 d0, d1
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/save_reg_neon.asm b/libvpx/vp8/common/arm/neon/save_reg_neon.asm
new file mode 100644
index 0000000..fd7002e
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/save_reg_neon.asm
@@ -0,0 +1,36 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_push_neon|
+ EXPORT |vp8_pop_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vp8_push_neon| PROC
+ vst1.i64 {d8, d9, d10, d11}, [r0]!
+ vst1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+|vp8_pop_neon| PROC
+ vld1.i64 {d8, d9, d10, d11}, [r0]!
+ vld1.i64 {d12, d13, d14, d15}, [r0]!
+ bx lr
+
+ ENDP
+
+ END
+
diff --git a/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
new file mode 100644
index 0000000..67d2ab0
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -0,0 +1,139 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_idct4x4llm_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;*************************************************************
+;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+; unsigned char *dst, int stride)
+;r0 short *input
+;r1 unsigned char *pred
+;r2 int pitch
+;r3 unsigned char *dst
+;sp int stride
+;*************************************************************
+
+; static const int cospi8sqrt2minus1=20091;
+; static const int sinpi8sqrt2 =35468;
+; static const int rounding = 0;
+
+; Optimization note: The data resulting from dequantization are signed
+; 13-bit values in the range [-4096, 4095]. This allows the NEON
+; "vqdmulh" instruction to be used, since the product cannot go out of
+; range (13+16+1 = 30 bits < 32 bits). The instruction returns the high
+; half of the multiplication, which is what the IDCT needs.
+
+|vp8_short_idct4x4llm_neon| PROC
+ adr r12, idct_coeff
+ vld1.16 {q1, q2}, [r0]
+ vld1.16 {d0}, [r12]
+
+ vswp d3, d4 ;q2(vp[4] vp[12])
+ ldr r0, [sp] ; stride
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2               ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q4, q4, q2
+
+ ;d6 - c1:temp1
+ ;d7 - d1:temp2
+ ;d8 - d1:temp1
+ ;d9 - c1:temp2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ vswp d3, d4
+
+ vqdmulh.s16 q3, q2, d0[2]
+ vqdmulh.s16 q4, q2, d0[0]
+
+ vqadd.s16 d12, d2, d3 ;a1
+ vqsub.s16 d13, d2, d3 ;b1
+
+ vshr.s16 q3, q3, #1
+ vshr.s16 q4, q4, #1
+
+    vqadd.s16 q3, q3, q2               ;modify since sinpi8sqrt2 > 65536/2 (negative number)
+ vqadd.s16 q4, q4, q2
+
+ vqsub.s16 d10, d6, d9 ;c1
+ vqadd.s16 d11, d7, d8 ;d1
+
+ vqadd.s16 d2, d12, d11
+ vqadd.s16 d3, d13, d10
+ vqsub.s16 d4, d13, d10
+ vqsub.s16 d5, d12, d11
+
+ vrshr.s16 d2, d2, #3
+ vrshr.s16 d3, d3, #3
+ vrshr.s16 d4, d4, #3
+ vrshr.s16 d5, d5, #3
+
+ vtrn.32 d2, d4
+ vtrn.32 d3, d5
+ vtrn.16 d2, d3
+ vtrn.16 d4, d5
+
+ ; load prediction data
+ vld1.32 d6[0], [r1], r2
+ vld1.32 d6[1], [r1], r2
+ vld1.32 d7[0], [r1], r2
+ vld1.32 d7[1], [r1], r2
+
+ ; add prediction and residual
+ vaddw.u8 q1, q1, d6
+ vaddw.u8 q2, q2, d7
+
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+
+ ; store to destination
+ vst1.32 d1[0], [r3], r0
+ vst1.32 d1[1], [r3], r0
+ vst1.32 d2[0], [r3], r0
+ vst1.32 d2[1], [r3], r0
+
+ bx lr
+
+ ENDP
+
+;-----------------
+
+idct_coeff
+ DCD 0x4e7b4e7b, 0x8a8c8a8c
+
+;20091, 20091, 35468, 35468
+
+ END
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
new file mode 100644
index 0000000..9fdafd3
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -0,0 +1,490 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict16x16_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter16_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+;Note: To take advantage of the 8-bit multiply instructions in NEON, abs() is
+; first applied to the filter coeffs to make them u8, and vmlsl is used for the
+; negative coeffs. After multiplication the running total can be negative, so
+; it is treated as s16. But the total can also be a large positive number
+; (> 2^15-1), which would be misread as negative. To avoid that, the coeffs are
+; applied in the order 0, 1, 4, 5, 2, which keeps the total within s16 range;
+; the 3rd coeff's product is then added with a saturating add. The same applies
+; to the other filter functions. (A scalar sketch of this ordering follows this
+; file.)
+
+|vp8_sixtap_predict16x16_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter16_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter16x16_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter16x16_only
+
+ sub sp, sp, #336 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #7 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (21x16)
+filt_blk2d_fp16x16_loop_neon
+ vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
+ vld1.u8 {d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d7, d0
+ vmull.u8 q10, d9, d0
+ vmull.u8 q11, d10, d0
+ vmull.u8 q12, d12, d0
+ vmull.u8 q13, d13, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d9, d10, #1
+ vext.8 d30, d12, d13, #1
+
+ vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q12, d30, d1
+
+ vext.8 d28, d7, d8, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d13, d14, #1
+
+ vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q11, d29, d1
+ vmlsl.u8 q13, d30, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d9, d10, #4
+ vext.8 d30, d12, d13, #4
+
+ vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q12, d30, d4
+
+ vext.8 d28, d7, d8, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d13, d14, #4
+
+ vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q11, d29, d4
+ vmlsl.u8 q13, d30, d4
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d9, d10, #5
+ vext.8 d30, d12, d13, #5
+
+ vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q12, d30, d5
+
+ vext.8 d28, d7, d8, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d13, d14, #5
+
+ vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q11, d29, d5
+ vmlal.u8 q13, d30, d5
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d9, d10, #2
+ vext.8 d30, d12, d13, #2
+
+ vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q12, d30, d2
+
+ vext.8 d28, d7, d8, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d13, d14, #2
+
+ vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q11, d29, d2
+ vmlal.u8 q13, d30, d2
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d9, d10, #3
+ vext.8 d30, d12, d13, #3
+
+ vext.8 d15, d7, d8, #3
+ vext.8 d31, d10, d11, #3
+ vext.8 d6, d13, d14, #3
+
+ vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+
+ vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q10, q5
+ vqadd.s16 q12, q6
+
+ vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q7, d31, d3
+ vmull.u8 q3, d6, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q9, q6
+ vqadd.s16 q11, q7
+ vqadd.s16 q13, q3
+
+ vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q9, #7
+ vqrshrun.s16 d8, q10, #7
+ vqrshrun.s16 d9, q11, #7
+ vqrshrun.s16 d10, q12, #7
+ vqrshrun.s16 d11, q13, #7
+
+ vst1.u8 {d6, d7, d8}, [lr]! ;store result
+ vst1.u8 {d9, d10, d11}, [lr]!
+
+ bne filt_blk2d_fp16x16_loop_neon
+
+;Second pass: 16x16
+;secondpass_filter - do the first 8 columns, then the second 8 columns
+ add r3, r12, r3, lsl #5
+ sub lr, lr, #336
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ mov r3, #2 ;loop counter
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ mov r2, #16
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp16x16_outloop_neon
+ vld1.u8 {d18}, [lr], r2 ;load src data
+ vld1.u8 {d19}, [lr], r2
+ vld1.u8 {d20}, [lr], r2
+ vld1.u8 {d21}, [lr], r2
+ mov r12, #4 ;loop counter
+ vld1.u8 {d22}, [lr], r2
+
+secondpass_inner_loop_neon
+ vld1.u8 {d23}, [lr], r2 ;load src data
+ vld1.u8 {d24}, [lr], r2
+ vld1.u8 {d25}, [lr], r2
+ vld1.u8 {d26}, [lr], r2
+
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r12, r12, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q9, q11
+ vst1.u8 {d7}, [r4], r5
+ vmov q10, q12
+ vst1.u8 {d8}, [r4], r5
+ vmov d22, d26
+ vst1.u8 {d9}, [r4], r5
+
+ bne secondpass_inner_loop_neon
+
+ subs r3, r3, #1
+ sub lr, lr, #336
+ add lr, lr, #8
+
+ sub r4, r4, r5, lsl #4
+ add r4, r4, #8
+
+ bne filt_blk2d_sp16x16_outloop_neon
+
+ add sp, sp, #336
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter16x16_only
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #8 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (column-2)
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First Pass: output_height lines x output_width columns (16x16)
+filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
+ vld1.u8 {d9, d10, d11}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+
+ vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q7, d7, d0
+ vmull.u8 q8, d9, d0
+ vmull.u8 q9, d10, d0
+
+ vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d21, d9, d10, #1
+ vext.8 d22, d7, d8, #1
+ vext.8 d23, d10, d11, #1
+ vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d25, d9, d10, #4
+ vext.8 d26, d7, d8, #4
+ vext.8 d27, d10, d11, #4
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d9, d10, #5
+
+ vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d21, d1
+ vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d23, d1
+ vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d25, d4
+ vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d27, d4
+ vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+
+ vext.8 d20, d7, d8, #5
+ vext.8 d21, d10, d11, #5
+ vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d23, d9, d10, #2
+ vext.8 d24, d7, d8, #2
+ vext.8 d25, d10, d11, #2
+
+ vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d27, d9, d10, #3
+ vext.8 d28, d7, d8, #3
+ vext.8 d29, d10, d11, #3
+
+ vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d21, d5
+ vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d23, d2
+ vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d25, d2
+
+ vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q11, d27, d3
+ vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q15, d29, d3
+
+ vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q11
+ vqadd.s16 q7, q12
+ vqadd.s16 q9, q15
+
+ subs r2, r2, #1
+
+ vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q7, #7
+ vqrshrun.s16 d8, q8, #7
+ vqrshrun.s16 d9, q9, #7
+
+ vst1.u8 {q3}, [r4], r5 ;store result
+ vst1.u8 {q4}, [r4], r5
+
+ bne filt_blk2d_fpo16x16_loop_neon
+
+ pop {r4-r5,pc}
+
+;--------------------
+secondpass_filter16x16_only
+;Second pass: 16x16
+ add r3, r12, r3, lsl #5
+ sub r0, r0, r1, lsl #1
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ mov r3, #2 ;loop counter
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_spo16x16_outloop_neon
+ vld1.u8 {d18}, [r0], r1 ;load src data
+ vld1.u8 {d19}, [r0], r1
+ vld1.u8 {d20}, [r0], r1
+ vld1.u8 {d21}, [r0], r1
+ mov r12, #4 ;loop counter
+ vld1.u8 {d22}, [r0], r1
+
+secondpass_only_inner_loop_neon
+ vld1.u8 {d23}, [r0], r1 ;load src data
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r12, r12, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q9, q11
+ vst1.u8 {d7}, [r4], r5
+ vmov q10, q12
+ vst1.u8 {d8}, [r4], r5
+ vmov d22, d26
+ vst1.u8 {d9}, [r4], r5
+
+ bne secondpass_only_inner_loop_neon
+
+ subs r3, r3, #1
+ sub r0, r0, r1, lsl #4
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1
+ add r0, r0, #8
+
+ sub r4, r4, r5, lsl #4
+ add r4, r4, #8
+
+ bne filt_blk2d_spo16x16_outloop_neon
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+ END
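The tap-ordering note at the top of this file is the safety argument for the whole family of six-tap kernels. A scalar sketch of one output sample under that ordering (hypothetical helper names; s points at src_ptr[-2], c holds one signed row of filter16_coeff):

#include <stdint.h>

static int16_t sat_s16(int32_t v)
{
    return v > INT16_MAX ? INT16_MAX : v < INT16_MIN ? INT16_MIN : (int16_t)v;
}

/* c[0], c[2], c[3], c[5] are non-negative and c[1], c[4] non-positive in
   every row of the table, so the asm vdups |c| and picks vmlal or vmlsl. */
static int16_t sixtap_sample(const uint8_t *s, const int16_t c[6])
{
    int32_t acc;
    acc  = s[0] *  c[0];    /* vmull.u8 */
    acc -= s[1] * -c[1];    /* vmlsl.u8 */
    acc -= s[4] * -c[4];    /* vmlsl.u8 */
    acc += s[5] *  c[5];    /* vmlal.u8 */
    acc += s[2] *  c[2];    /* vmlal.u8 */
    /* with this order acc has stayed inside s16 range so far; the dominant
       tap is added last with a saturating add, as vqadd.s16 does */
    return sat_s16(acc + s[3] * c[3]);
}

The result is then narrowed with vqrshrun.s16 #7, i.e. (acc + 64) >> 7 clamped to u8.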
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
new file mode 100644
index 0000000..a4222bc
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -0,0 +1,422 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict4x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter4_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(lr) int dst_pitch
+
+|vp8_sixtap_predict4x4_neon| PROC
+ push {r4, lr}
+
+ adr r12, filter4_coeff
+ ldr r4, [sp, #8] ;load parameters from stack
+ ldr lr, [sp, #12] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter4x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter4x4_only
+
+    vabs.s32        q12, q14        ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+ sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
+
+;First pass: output_height lines x output_width columns (9x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+
+    vld1.u8         {q3}, [r0], r1  ;load the remaining 5 lines of src data
+ vld1.u8 {q4}, [r0], r1
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+    ;First pass on the remaining 5 lines of data
+ vld1.u8 {q11}, [r0], r1
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+ vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+ vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
+
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+ vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
+
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+ vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
+
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+ vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+ vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
+
+ add r3, r12, r3, lsl #5
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+ vqadd.s16 q12, q11
+
+ vext.8 d23, d27, d28, #4
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d30, q8, #7
+ vqrshrun.s16 d31, q12, #7
+
+;Second pass: 4x4
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+firstpass_filter4x4_only
+    vabs.s32        q12, q14        ;get abs(filter_parameters)
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;go back 2 columns of src data
+
+;First pass: output_height lines x output_width columns (4x4)
+ vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+ vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d19, d8, d9, #5
+ vext.8 d20, d10, d11, #5
+ vext.8 d21, d12, d13, #5
+
+ vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
+ vswp d11, d12
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
+ vzip.32 d20, d21
+ vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q8, d20, d5
+
+ vmov q4, q3 ;keep original src data in q4 q6
+ vmov q6, q5
+
+ vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
+ vshr.u64 q10, q6, #8
+ vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
+ vmlal.u8 q8, d10, d0
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #32 ;construct src_ptr[2]
+ vshr.u64 q5, q6, #32
+ vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d20, d1
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
+ vzip.32 d10, d11
+ vshr.u64 q9, q4, #16 ;construct src_ptr[0]
+ vshr.u64 q10, q6, #16
+ vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d10, d4
+
+ vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
+ vzip.32 d20, d21
+ vshr.u64 q3, q4, #24 ;construct src_ptr[1]
+ vshr.u64 q5, q6, #24
+ vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d20, d2
+
+ vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
+ vzip.32 d10, d11
+ vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q10, d10, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q10
+
+ vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d28, q8, #7
+
+ vst1.32 {d27[0]}, [r4] ;store result
+ vst1.32 {d27[1]}, [r0]
+ vst1.32 {d28[0]}, [r1]
+ vst1.32 {d28[1]}, [r2]
+
+ pop {r4, pc}
+
+
+;---------------------
+secondpass_filter4x4_only
+ sub r0, r0, r1, lsl #1
+ add r3, r12, r3, lsl #5
+
+ vld1.32 {d27[0]}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.32 {d27[1]}, [r0], r1
+ vabs.s32 q7, q5
+ vld1.32 {d28[0]}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.32 {d28[1]}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.32 {d29[0]}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.32 {d29[1]}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.32 {d30[0]}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.32 {d30[1]}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.32 {d31[0]}, [r0], r1
+ vdup.8 d5, d16[4]
+
+ vext.8 d23, d27, d28, #4
+ vext.8 d24, d28, d29, #4
+ vext.8 d25, d29, d30, #4
+ vext.8 d26, d30, d31, #4
+
+ vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d28, d0
+
+ vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmull.u8 q6, d26, d5
+
+ vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d30, d4
+
+ vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q6, d24, d1
+
+ vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d29, d2
+
+ vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmlal.u8 q6, d25, d3
+
+ add r0, r4, lr
+ add r1, r0, lr
+ add r2, r1, lr
+
+ vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q6, q4
+
+ vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d4, q6, #7
+
+ vst1.32 {d3[0]}, [r4] ;store result
+ vst1.32 {d3[1]}, [r0]
+ vst1.32 {d4[0]}, [r1]
+ vst1.32 {d4[1]}, [r2]
+
+ pop {r4, pc}
+
+ ENDP
+
+;-----------------
+
+ END
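All of the sixtap_predict variants implement the same two-pass scheme: a horizontal pass over block_height + 5 source rows into a scratch buffer (21x16 for the 16x16 case, 9x4 here), then a vertical pass over that buffer. A compact scalar reference under those assumptions (sizes bounded at 16x16; an illustrative sketch, not the upstream C code):

#include <stdint.h>

static uint8_t round_u8(int v)
{
    v = (v + 64) >> 7;              /* what vqrshrun.s16 #7 computes */
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* hf/vf are six-tap rows of the coefficient table, selected by xoffset and
   yoffset; w, h <= 16. */
static void sixtap_predict_ref(const uint8_t *src, int src_stride,
                               const int16_t hf[6], const int16_t vf[6],
                               uint8_t *dst, int dst_stride, int w, int h)
{
    uint8_t tmp[21 * 16];           /* (h + 5) rows of w intermediate pixels */
    const uint8_t *s = src - 2 * src_stride - 2;    /* start at (line-2, col-2) */
    int r, c, t, sum;

    for (r = 0; r < h + 5; ++r)     /* first pass: horizontal */
        for (c = 0; c < w; ++c) {
            for (sum = 0, t = 0; t < 6; ++t)
                sum += s[r * src_stride + c + t] * hf[t];
            tmp[r * w + c] = round_u8(sum);
        }

    for (r = 0; r < h; ++r)         /* second pass: vertical */
        for (c = 0; c < w; ++c) {
            for (sum = 0, t = 0; t < 6; ++t)
                sum += tmp[(r + t) * w + c] * vf[t];
            dst[r * dst_stride + c] = round_u8(sum);
        }
}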
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
new file mode 100644
index 0000000..a57ec01
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -0,0 +1,473 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x4_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; r4 unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x4_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter8_coeff
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x4_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x4_only
+
+ sub sp, sp, #32 ;reserve space on stack for temporary storage
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ mov lr, sp
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (9x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q7}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+    ;first_pass filtering on the remaining 5 lines of data
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d27, q9, #7
+ vqrshrun.s16 d28, q10, #7
+    vqrshrun.s16    d29, q11, #7
+ vqrshrun.s16 d30, q12, #7
+
+;Second pass: 8x4
+;secondpass_filter
+ add r3, r12, r3, lsl #5
+ sub lr, lr, #32
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vld1.u8 {q12}, [lr]!
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ add sp, sp, #32
+ pop {r4-r5,pc}
+
+;--------------------
+firstpass_filter8x4_only
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d1, d24[4]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d2, d25[0]
+ vld1.u8 {q6}, [r0], r1
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (4x8)
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x4_only
+;Second pass: 8x4
+ add r3, r12, r3, lsl #5
+ sub r0, r0, r1, lsl #1
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vld1.u8 {d22}, [r0], r1
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d28}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d29}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d30}, [r0], r1
+
+ vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d23, d0
+ vmull.u8 q5, d24, d0
+ vmull.u8 q6, d25, d0
+
+ vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d24, d1
+ vmlsl.u8 q5, d25, d1
+ vmlsl.u8 q6, d26, d1
+
+ vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d27, d4
+ vmlsl.u8 q5, d28, d4
+ vmlsl.u8 q6, d29, d4
+
+ vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d25, d2
+ vmlal.u8 q5, d26, d2
+ vmlal.u8 q6, d27, d2
+
+ vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d28, d5
+ vmlal.u8 q5, d29, d5
+ vmlal.u8 q6, d30, d5
+
+ vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d26, d3
+ vmull.u8 q9, d27, d3
+ vmull.u8 q10, d28, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vst1.u8 {d7}, [r4], r5
+ vst1.u8 {d8}, [r4], r5
+ vst1.u8 {d9}, [r4], r5
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+ END
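Each filter*_coeff row above is eight 32-bit words (32 bytes), which is why every variant selects its taps with "add rN, r12, offset, lsl #5"; the two trailing zeros per row are padding so a whole row loads as a q-register pair (vld1.s32 {q14, q15}). The equivalent C lookup, with the contents copied from the DCD rows (an illustrative declaration, not the upstream one):

/* Rows indexed by the 1/8-pel offset (0..7); entries 6 and 7 are padding. */
static const int filter_coeff[8][8] = {
    { 0,   0, 128,   0,   0,  0, 0, 0 },   /* full-pel: copy */
    { 0,  -6, 123,  12,  -1,  0, 0, 0 },
    { 2, -11, 108,  36,  -8,  1, 0, 0 },
    { 0,  -9,  93,  50,  -6,  0, 0, 0 },
    { 3, -16,  77,  77, -16,  3, 0, 0 },   /* half-pel */
    { 0,  -6,  50,  93,  -9,  0, 0, 0 },
    { 1,  -8,  36, 108, -11,  2, 0, 0 },
    { 0,  -1,  12, 123,  -6,  0, 0, 0 },
};

/* "add r2, r12, r2, lsl #5": offset * 32 bytes past the table base. */
static const int *select_filter(int offset)
{
    return filter_coeff[offset & 7];
}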
diff --git a/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
new file mode 100644
index 0000000..00ed5ae
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -0,0 +1,524 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sixtap_predict8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+filter8_coeff
+ DCD 0, 0, 128, 0, 0, 0, 0, 0
+ DCD 0, -6, 123, 12, -1, 0, 0, 0
+ DCD 2, -11, 108, 36, -8, 1, 0, 0
+ DCD 0, -9, 93, 50, -6, 0, 0, 0
+ DCD 3, -16, 77, 77, -16, 3, 0, 0
+ DCD 0, -6, 50, 93, -9, 0, 0, 0
+ DCD 1, -8, 36, 108, -11, 2, 0, 0
+ DCD 0, -1, 12, 123, -6, 0, 0, 0
+
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pitch
+
+|vp8_sixtap_predict8x8_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, filter8_coeff
+
+ ldr r4, [sp, #12] ;load parameters from stack
+ ldr r5, [sp, #16] ;load parameters from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_filter8x8_only
+
+ add r2, r12, r2, lsl #5 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+
+ beq firstpass_filter8x8_only
+
+ sub sp, sp, #64 ;reserve space on stack for temporary storage
+ mov lr, sp
+
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+ sub r0, r0, r1, lsl #1
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+
+;First pass: output_height lines x output_width columns (13x8)
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vdup.8 d3, d25[4]
+ vld1.u8 {q4}, [r0], r1
+ vdup.8 d4, d26[0]
+ vld1.u8 {q5}, [r0], r1
+ vdup.8 d5, d26[4]
+ vld1.u8 {q6}, [r0], r1
+
+filt_blk2d_fp8x8_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ subs r2, r2, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vld1.u8 {q3}, [r0], r1 ;load src data
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [lr]! ;store result
+ vld1.u8 {q4}, [r0], r1
+ vst1.u8 {d23}, [lr]!
+ vld1.u8 {q5}, [r0], r1
+ vst1.u8 {d24}, [lr]!
+ vld1.u8 {q6}, [r0], r1
+ vst1.u8 {d25}, [lr]!
+
+ bne filt_blk2d_fp8x8_loop_neon
+
+    ;first_pass filtering on the remaining 5 lines of data
+ ;vld1.u8 {q3}, [r0], r1 ;load src data
+ ;vld1.u8 {q4}, [r0], r1
+ ;vld1.u8 {q5}, [r0], r1
+ ;vld1.u8 {q6}, [r0], r1
+ vld1.u8 {q7}, [r0], r1
+
+ vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+ vmull.u8 q11, d12, d0
+ vmull.u8 q12, d14, d0
+
+ vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d28, d8, d9, #1
+ vext.8 d29, d10, d11, #1
+ vext.8 d30, d12, d13, #1
+ vext.8 d31, d14, d15, #1
+
+ vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q9, d28, d1
+ vmlsl.u8 q10, d29, d1
+ vmlsl.u8 q11, d30, d1
+ vmlsl.u8 q12, d31, d1
+
+ vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d28, d8, d9, #4
+ vext.8 d29, d10, d11, #4
+ vext.8 d30, d12, d13, #4
+ vext.8 d31, d14, d15, #4
+
+ vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q9, d28, d4
+ vmlsl.u8 q10, d29, d4
+ vmlsl.u8 q11, d30, d4
+ vmlsl.u8 q12, d31, d4
+
+ vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d28, d8, d9, #2
+ vext.8 d29, d10, d11, #2
+ vext.8 d30, d12, d13, #2
+ vext.8 d31, d14, d15, #2
+
+ vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q9, d28, d2
+ vmlal.u8 q10, d29, d2
+ vmlal.u8 q11, d30, d2
+ vmlal.u8 q12, d31, d2
+
+ vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d28, d8, d9, #5
+ vext.8 d29, d10, d11, #5
+ vext.8 d30, d12, d13, #5
+ vext.8 d31, d14, d15, #5
+
+ vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q9, d28, d5
+ vmlal.u8 q10, d29, d5
+ vmlal.u8 q11, d30, d5
+ vmlal.u8 q12, d31, d5
+
+ vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d28, d8, d9, #3
+ vext.8 d29, d10, d11, #3
+ vext.8 d30, d12, d13, #3
+ vext.8 d31, d14, d15, #3
+
+ vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d28, d3
+ vmull.u8 q5, d29, d3
+ vmull.u8 q6, d30, d3
+ vmull.u8 q7, d31, d3
+
+ vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q9, q4
+ vqadd.s16 q10, q5
+ vqadd.s16 q11, q6
+ vqadd.s16 q12, q7
+
+ add r3, r12, r3, lsl #5
+
+ vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
+ sub lr, lr, #64
+ vqrshrun.s16 d27, q9, #7
+ vld1.u8 {q9}, [lr]! ;load intermediate data from stack
+ vqrshrun.s16 d28, q10, #7
+ vld1.u8 {q10}, [lr]!
+
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+
+ vqrshrun.s16 d29, q11, #7
+ vld1.u8 {q11}, [lr]!
+
+ vabs.s32 q7, q5
+ vabs.s32 q8, q6
+
+ vqrshrun.s16 d30, q12, #7
+ vld1.u8 {q12}, [lr]!
+
+;Second pass: 8x8
+ mov r3, #2 ;loop counter
+
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vdup.8 d1, d14[4]
+ vdup.8 d2, d15[0]
+ vdup.8 d3, d15[4]
+ vdup.8 d4, d16[0]
+ vdup.8 d5, d16[4]
+
+filt_blk2d_sp8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_sp8x8_loop_neon
+
+ add sp, sp, #64
+ pop {r4-r5,pc}
+
+;---------------------
+firstpass_filter8x8_only
+ ;add r2, r12, r2, lsl #5 ;calculate filter location
+ ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
+ vabs.s32 q12, q14
+ vabs.s32 q13, q15
+
+ mov r2, #2 ;loop counter
+ sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
+
+ vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
+ vdup.8 d1, d24[4]
+ vdup.8 d2, d25[0]
+ vdup.8 d3, d25[4]
+ vdup.8 d4, d26[0]
+ vdup.8 d5, d26[4]
+
+;First pass: output_height lines x output_width columns (8x8)
+filt_blk2d_fpo8x8_loop_neon
+ vld1.u8 {q3}, [r0], r1 ;load src data
+ vld1.u8 {q4}, [r0], r1
+ vld1.u8 {q5}, [r0], r1
+ vld1.u8 {q6}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q8, d8, d0
+ vmull.u8 q9, d10, d0
+ vmull.u8 q10, d12, d0
+
+ vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
+ vext.8 d29, d8, d9, #1
+ vext.8 d30, d10, d11, #1
+ vext.8 d31, d12, d13, #1
+
+ vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q8, d29, d1
+ vmlsl.u8 q9, d30, d1
+ vmlsl.u8 q10, d31, d1
+
+ vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
+ vext.8 d29, d8, d9, #4
+ vext.8 d30, d10, d11, #4
+ vext.8 d31, d12, d13, #4
+
+ vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q8, d29, d4
+ vmlsl.u8 q9, d30, d4
+ vmlsl.u8 q10, d31, d4
+
+ vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
+ vext.8 d29, d8, d9, #2
+ vext.8 d30, d10, d11, #2
+ vext.8 d31, d12, d13, #2
+
+ vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q8, d29, d2
+ vmlal.u8 q9, d30, d2
+ vmlal.u8 q10, d31, d2
+
+ vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
+ vext.8 d29, d8, d9, #5
+ vext.8 d30, d10, d11, #5
+ vext.8 d31, d12, d13, #5
+
+ vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q8, d29, d5
+ vmlal.u8 q9, d30, d5
+ vmlal.u8 q10, d31, d5
+
+ vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
+ vext.8 d29, d8, d9, #3
+ vext.8 d30, d10, d11, #3
+ vext.8 d31, d12, d13, #3
+
+ vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q4, d29, d3
+ vmull.u8 q5, d30, d3
+ vmull.u8 q6, d31, d3
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ subs r2, r2, #1
+
+ vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d23, q8, #7
+ vqrshrun.s16 d24, q9, #7
+ vqrshrun.s16 d25, q10, #7
+
+ vst1.u8 {d22}, [r4], r5 ;store result
+ vst1.u8 {d23}, [r4], r5
+ vst1.u8 {d24}, [r4], r5
+ vst1.u8 {d25}, [r4], r5
+
+ bne filt_blk2d_fpo8x8_loop_neon
+
+ pop {r4-r5,pc}
+
+;---------------------
+secondpass_filter8x8_only
+ sub r0, r0, r1, lsl #1
+ add r3, r12, r3, lsl #5
+
+ vld1.u8 {d18}, [r0], r1 ;load src data
+ vld1.s32 {q5, q6}, [r3] ;load second_pass filter
+ vld1.u8 {d19}, [r0], r1
+ vabs.s32 q7, q5
+ vld1.u8 {d20}, [r0], r1
+ vabs.s32 q8, q6
+ vld1.u8 {d21}, [r0], r1
+ mov r3, #2 ;loop counter
+ vld1.u8 {d22}, [r0], r1
+ vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
+ vld1.u8 {d23}, [r0], r1
+ vdup.8 d1, d14[4]
+ vld1.u8 {d24}, [r0], r1
+ vdup.8 d2, d15[0]
+ vld1.u8 {d25}, [r0], r1
+ vdup.8 d3, d15[4]
+ vld1.u8 {d26}, [r0], r1
+ vdup.8 d4, d16[0]
+ vld1.u8 {d27}, [r0], r1
+ vdup.8 d5, d16[4]
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+;Second pass: 8x8
+filt_blk2d_spo8x8_loop_neon
+ vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
+ vmull.u8 q4, d19, d0
+ vmull.u8 q5, d20, d0
+ vmull.u8 q6, d21, d0
+
+ vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
+ vmlsl.u8 q4, d20, d1
+ vmlsl.u8 q5, d21, d1
+ vmlsl.u8 q6, d22, d1
+
+ vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
+ vmlsl.u8 q4, d23, d4
+ vmlsl.u8 q5, d24, d4
+ vmlsl.u8 q6, d25, d4
+
+ vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
+ vmlal.u8 q4, d21, d2
+ vmlal.u8 q5, d22, d2
+ vmlal.u8 q6, d23, d2
+
+ vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
+ vmlal.u8 q4, d24, d5
+ vmlal.u8 q5, d25, d5
+ vmlal.u8 q6, d26, d5
+
+ vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
+ vmull.u8 q8, d22, d3
+ vmull.u8 q9, d23, d3
+ vmull.u8 q10, d24, d3
+
+ subs r3, r3, #1
+
+ vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
+ vqadd.s16 q8, q4
+ vqadd.s16 q9, q5
+ vqadd.s16 q10, q6
+
+ vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
+ vqrshrun.s16 d7, q8, #7
+ vqrshrun.s16 d8, q9, #7
+ vqrshrun.s16 d9, q10, #7
+
+ vmov q9, q11
+ vst1.u8 {d6}, [r4], r5 ;store result
+ vmov q10, q12
+ vst1.u8 {d7}, [r4], r5
+ vmov q11, q13
+ vst1.u8 {d8}, [r4], r5
+ vmov q12, q14
+ vst1.u8 {d9}, [r4], r5
+ vmov d26, d30
+
+ bne filt_blk2d_spo8x8_loop_neon
+
+ pop {r4-r5,pc}
+
+ ENDP
+
+;-----------------
+
+ END
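The second-pass loops keep five intermediate rows live in registers and rotate them after every four stores (the vmov q9, q11 ... vmov d26, d30 tails), so each iteration loads only the four new rows it consumes. A scalar sketch of that sliding window (hypothetical helper, not upstream code):

#include <stdint.h>
#include <string.h>

/* Vertical pass over an (h + 5)-row intermediate buffer, keeping a six-row
   window the way the loops above keep rows in q9..q13. */
static void second_pass_window(const uint8_t *mid, int stride,
                               const int16_t vf[6],
                               uint8_t *dst, int dst_stride, int w, int h)
{
    const uint8_t *win[6];
    int r, c, t;

    for (t = 0; t < 6; ++t)
        win[t] = mid + t * stride;              /* initial window */

    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
            int sum = 0;
            for (t = 0; t < 6; ++t)
                sum += win[t][c] * vf[t];
            sum = (sum + 64) >> 7;              /* vqrshrun.s16 #7 */
            dst[r * dst_stride + c] = sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
        }
        memmove(win, win + 1, 5 * sizeof win[0]);   /* slide: the vmov chain */
        win[5] = win[4] + stride;
    }
}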
diff --git a/libvpx/vp8/common/arm/neon/variance_neon.asm b/libvpx/vp8/common/arm/neon/variance_neon.asm
new file mode 100644
index 0000000..e3b4832
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/variance_neon.asm
@@ -0,0 +1,276 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance16x16_neon|
+ EXPORT |vp8_variance16x8_neon|
+ EXPORT |vp8_variance8x16_neon|
+ EXPORT |vp8_variance8x8_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
+    ;the results into the elements of the destination vector. (The
+    ;explanation in the ARM guide is wrong.)
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ ;vmov.32 r0, d0[0] ;this instruction costs a lot
+ ;vmov.32 r1, d1[0]
+ ;mul r0, r0, r0
+ ;str r1, [r12]
+ ;sub r0, r1, r0, lsr #8
+
+ ; while sum is signed, sum * sum is always positive and must be treated as
+ ; unsigned to avoid propagating the sign bit.
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #4
+
+variance16x8_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance16x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+variance8x16_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d2, d6
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+
+ bne variance8x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #7
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+;==================================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+variance8x8_neon_loop
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4 ;calculate diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;calculate sum
+ vmlal.s16 q9, d22, d22 ;calculate sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r12] ;store sse
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ bx lr
+
+ ENDP
+
+ END
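All four variance kernels share one structure: accumulate a signed sum and an sse over the block, then return sse - sum*sum / N, where the final vshr (#8, #7 or #6) is log2 of the pixel count. A scalar model (hypothetical helper name; it mirrors the unsigned-square note in vp8_variance16x16_neon):

#include <stdint.h>

static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse)
{
    int64_t sum = 0;
    uint32_t sq = 0;
    int r, c;

    for (r = 0; r < h; ++r)
        for (c = 0; c < w; ++c) {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sum += d;
            sq += (uint32_t)(d * d);
        }
    *sse = sq;
    /* sum is signed, but sum * sum is non-negative and must be handled as
       unsigned so the shift does not propagate a sign bit */
    return sq - (unsigned int)(((uint64_t)(sum * sum)) / (unsigned)(w * h));
}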
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
new file mode 100644
index 0000000..e7a3ed1
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -0,0 +1,423 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+;-----------------
+
+ EXPORT |vp8_sub_pixel_variance16x16_neon_func|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
+
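+; A scalar sketch of one first-pass output pixel of this bilinear filter
+; (hypothetical helper; F is a pair from bilinear_taps_coeff, selected by
+; "add r2, r12, r2, lsl #3" since each pair is two 32-bit words):
+;
+;   #include <stdint.h>
+;
+;   static uint8_t bilinear_sample(const uint8_t *s, int step, const int F[2])
+;   {
+;       /* taps sum to 128, so (a*F0 + b*F1 + 64) >> 7 already fits in u8;
+;          this is what the vmull/vmlal/vqrshrn.u16 #7 sequence computes */
+;       return (uint8_t)((s[0] * F[0] + s[step] * F[1] + 64) >> 7);
+;   }
+;
+; step is 1 for the horizontal pass and the row pitch for the vertical pass.
+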
+|vp8_sub_pixel_variance16x16_neon_func| PROC
+ push {r4-r6, lr}
+
+ adr r12, bilinear_taps_coeff
+ ldr r4, [sp, #16] ;load *dst_ptr from stack
+ ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
+ ldr r6, [sp, #24] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16_only
+
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+
+ vld1.s32 {d31}, [r2] ;load first_pass filter
+
+ beq firstpass_bfilter16x16_only
+
+ sub sp, sp, #272 ;reserve space on stack for temporary storage
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ mov lr, sp
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ mov r2, #3 ;loop counter
+ vld1.u8 {d8, d9, d10}, [r0], r1
+
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ vdup.8 d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vqrshrn.u16 d21, q14, #7
+ vld1.u8 {d5, d6, d7}, [r0], r1
+
+ vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vst1.u8 {d18, d19, d20, d21}, [lr]!
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ bne vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+ vld1.u8 {d14, d15, d16}, [r0], r1
+
+ vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q10, d3, d0
+ vmull.u8 q11, d5, d0
+ vmull.u8 q12, d6, d0
+ vmull.u8 q13, d8, d0
+ vmull.u8 q14, d9, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+
+ vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q11, d5, d1
+ vmlal.u8 q13, d8, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+
+ vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q12, d6, d1
+ vmlal.u8 q14, d9, d1
+
+ vmull.u8 q1, d11, d0
+ vmull.u8 q2, d12, d0
+ vmull.u8 q3, d14, d0
+ vmull.u8 q4, d15, d0
+
+ vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
+ vext.8 d14, d14, d15, #1
+
+ vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q3, d14, d1
+
+ vext.8 d12, d12, d13, #1
+ vext.8 d15, d15, d16, #1
+
+ vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q4, d15, d1
+
+ vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d11, q10, #7
+ vqrshrn.u16 d12, q11, #7
+ vqrshrn.u16 d13, q12, #7
+ vqrshrn.u16 d14, q13, #7
+ vqrshrn.u16 d15, q14, #7
+ vqrshrn.u16 d16, q1, #7
+ vqrshrn.u16 d17, q2, #7
+ vqrshrn.u16 d18, q3, #7
+ vqrshrn.u16 d19, q4, #7
+
+ vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
+ vst1.u8 {d14, d15, d16, d17}, [lr]!
+ vst1.u8 {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+ add r3, r12, r3, lsl #3
+ sub lr, lr, #272
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ sub sp, sp, #256
+ mov r3, sp
+
+ vld1.u8 {d22, d23}, [lr]! ;load src data
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r12, #4 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+ vld1.u8 {d24, d25}, [lr]!
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [lr]!
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [lr]!
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [lr]!
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ subs r12, r12, #1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5}, [r3]!
+ vst1.u8 {d6, d7}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_sp16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+ mov r2, #4 ;loop counter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vdup.8 d1, d31[4]
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+ vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
+ vld1.u8 {d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10}, [r0], r1
+ vld1.u8 {d11, d12, d13}, [r0], r1
+
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+
+ vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q8, d3, d0
+ vmull.u8 q9, d5, d0
+ vmull.u8 q10, d6, d0
+ vmull.u8 q11, d8, d0
+ vmull.u8 q12, d9, d0
+ vmull.u8 q13, d11, d0
+ vmull.u8 q14, d12, d0
+
+ vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d5, d6, #1
+ vext.8 d8, d8, d9, #1
+ vext.8 d11, d11, d12, #1
+
+ vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q9, d5, d1
+ vmlal.u8 q11, d8, d1
+ vmlal.u8 q13, d11, d1
+
+ vext.8 d3, d3, d4, #1
+ vext.8 d6, d6, d7, #1
+ vext.8 d9, d9, d10, #1
+ vext.8 d12, d12, d13, #1
+
+ vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q10, d6, d1
+ vmlal.u8 q12, d9, d1
+ vmlal.u8 q14, d12, d1
+
+ subs r2, r2, #1
+
+ vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d15, q8, #7
+ vqrshrn.u16 d16, q9, #7
+ vqrshrn.u16 d17, q10, #7
+ vqrshrn.u16 d18, q11, #7
+ vqrshrn.u16 d19, q12, #7
+ vqrshrn.u16 d20, q13, #7
+ vst1.u8 {d14, d15}, [r3]! ;store result
+ vqrshrn.u16 d21, q14, #7
+
+ vst1.u8 {d16, d17}, [r3]!
+ vst1.u8 {d18, d19}, [r3]!
+ vst1.u8 {d20, d21}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+ sub sp, sp, #528 ;reserve space on stack for temporary storage
+ add r3, r12, r3, lsl #3
+ mov r12, #4 ;loop counter
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+ vld1.u8 {d22, d23}, [r0], r1 ;load src data
+ mov r3, sp
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+ vld1.u8 {d24, d25}, [r0], r1
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vld1.u8 {d26, d27}, [r0], r1
+ vmull.u8 q2, d23, d0
+ vld1.u8 {d28, d29}, [r0], r1
+ vmull.u8 q3, d24, d0
+ vld1.u8 {d30, d31}, [r0], r1
+
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d25, d1
+ vmlal.u8 q3, d26, d1
+ vmlal.u8 q4, d27, d1
+ vmlal.u8 q5, d28, d1
+ vmlal.u8 q6, d29, d1
+ vmlal.u8 q7, d30, d1
+ vmlal.u8 q8, d31, d1
+
+ vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d3, q2, #7
+ vqrshrn.u16 d4, q3, #7
+ vqrshrn.u16 d5, q4, #7
+ vqrshrn.u16 d6, q5, #7
+ vqrshrn.u16 d7, q6, #7
+ vqrshrn.u16 d8, q7, #7
+ vqrshrn.u16 d9, q8, #7
+
+ vst1.u8 {d2, d3}, [r3]! ;store result
+ subs r12, r12, #1
+ vst1.u8 {d4, d5}, [r3]!
+ vmov q11, q15
+ vst1.u8 {d6, d7}, [r3]!
+ vst1.u8 {d8, d9}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16_loop_neon
+
+ b sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r12, #8
+
+sub_pixel_variance16x16_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q2}, [r4], r5
+ vld1.8 {q1}, [r3]!
+ vld1.8 {q3}, [r4], r5
+
+ vsubl.u8 q11, d0, d4 ;diff
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne sub_pixel_variance16x16_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [r6] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ add sp, sp, #528
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4-r6,pc}
+
+ ENDP
+
+ END
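This routine is the general two-pass path: the first pass applies the horizontal bilinear taps to 17 source rows into a stack buffer, the second pass applies the vertical taps to produce the 16 output rows, and the tail computes variance against dst with a final shift of #8 (sum*sum/256 for 256 pixels). A scalar model of one bilinear pass, matching the vqrshrn #7 rounding; the helper name and signature are illustrative, not the upstream function:

    /* One bilinear pass (illustrative sketch). filter points at a
     * {Filter[0], Filter[1]} pair from the 128-step tap table above
     * (e.g. offset 4 selects {64, 64}); pixel_step is 1 for the
     * horizontal pass and the source pitch for the vertical pass. */
    static void bilinear_pass_model(const unsigned char *src,
                                    unsigned char *dst,
                                    int src_pitch, int pixel_step,
                                    int height, int width,
                                    const int *filter)
    {
        int r, c;
        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
            {
                int v = src[c] * filter[0] + src[c + pixel_step] * filter[1];
                dst[c] = (unsigned char)((v + 64) >> 7); /* round, / 128 */
            }
            src += src_pitch;
            dst += width;
        }
    }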
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
new file mode 100644
index 0000000..155be4f
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -0,0 +1,572 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_variance_halfpixvar16x16_h_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_v_neon|
+ EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
+ EXPORT |vp8_sub_pixel_variance16x16s_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_h_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_h_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.8 {q11}, [r2], r3
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.8 {q12}, [r2], r3
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.8 {q13}, [r2], r3
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vext.8 q3, q2, q3, #1
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vld1.8 {q14}, [r2], r3
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+
+ vsubl.u8 q4, d0, d22 ;diff
+ vsubl.u8 q5, d1, d23
+ vsubl.u8 q6, d2, d24
+ vsubl.u8 q7, d3, d25
+ vsubl.u8 q0, d4, d26
+ vsubl.u8 q1, d5, d27
+ vsubl.u8 q2, d6, d28
+ vsubl.u8 q3, d7, d29
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_fpo16x16s_4_0_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_v_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_v_neon| PROC
+ push {lr}
+
+ mov r12, #4 ;loop counter
+
+ vld1.u8 {q0}, [r0], r1 ;load src data
+ ldr lr, [sp, #4] ;load *sse from stack
+
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+ vld1.u8 {q2}, [r0], r1
+ vld1.8 {q1}, [r2], r3
+ vld1.u8 {q4}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+ vld1.u8 {q6}, [r0], r1
+ vld1.8 {q5}, [r2], r3
+ vld1.u8 {q15}, [r0], r1
+
+ vrhadd.u8 q0, q0, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q4
+ vrhadd.u8 q4, q4, q6
+ vrhadd.u8 q6, q6, q15
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r12, r12, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+
+ vmov q0, q15
+
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne vp8_filt_spo16x16s_0_4_loop_neon
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;================================================
+;unsigned int vp8_variance_halfpixvar16x16_hv_neon
+;(
+; unsigned char *src_ptr, r0
+; int src_pixels_per_line, r1
+; unsigned char *dst_ptr, r2
+; int dst_pixels_per_line, r3
+; unsigned int *sse
+;);
+;================================================
+|vp8_variance_halfpixvar16x16_hv_neon| PROC
+ push {lr}
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+
+ ldr lr, [sp, #4] ;load *sse from stack
+ vmov.i8 q13, #0 ;q13 - sum
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+
+ vmov.i8 q14, #0 ;q14, q15 - sse
+ vmov.i8 q15, #0
+
+ mov r12, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vld1.8 {q5}, [r2], r3
+ vrhadd.u8 q0, q0, q1
+ vld1.8 {q6}, [r2], r3
+ vrhadd.u8 q1, q1, q2
+ vld1.8 {q7}, [r2], r3
+ vrhadd.u8 q2, q2, q3
+ vld1.8 {q8}, [r2], r3
+ vrhadd.u8 q3, q3, q4
+
+ vsubl.u8 q9, d0, d10 ;diff
+ vsubl.u8 q10, d1, d11
+ vsubl.u8 q11, d2, d12
+ vsubl.u8 q12, d3, d13
+
+ vsubl.u8 q0, d4, d14 ;diff
+ vsubl.u8 q1, d5, d15
+ vsubl.u8 q5, d6, d16
+ vsubl.u8 q6, d7, d17
+
+ vpadal.s16 q13, q9 ;sum
+ vmlal.s16 q14, d18, d18 ;sse
+ vmlal.s16 q15, d19, d19
+
+ vpadal.s16 q13, q10 ;sum
+ vmlal.s16 q14, d20, d20 ;sse
+ vmlal.s16 q15, d21, d21
+
+ vpadal.s16 q13, q11 ;sum
+ vmlal.s16 q14, d22, d22 ;sse
+ vmlal.s16 q15, d23, d23
+
+ vpadal.s16 q13, q12 ;sum
+ vmlal.s16 q14, d24, d24 ;sse
+ vmlal.s16 q15, d25, d25
+
+ subs r12, r12, #1
+
+ vpadal.s16 q13, q0 ;sum
+ vmlal.s16 q14, d0, d0 ;sse
+ vmlal.s16 q15, d1, d1
+
+ vpadal.s16 q13, q1 ;sum
+ vmlal.s16 q14, d2, d2 ;sse
+ vmlal.s16 q15, d3, d3
+
+ vpadal.s16 q13, q5 ;sum
+ vmlal.s16 q14, d10, d10 ;sse
+ vmlal.s16 q15, d11, d11
+
+ vmov q0, q4
+
+ vpadal.s16 q13, q6 ;sum
+ vmlal.s16 q14, d12, d12 ;sse
+ vmlal.s16 q15, d13, d13
+
+ bne vp8_filt16x16s_4_4_loop_neon
+
+ vadd.u32 q15, q14, q15 ;accumulate sse
+ vpaddl.s32 q0, q13 ;accumulate sum
+
+ vpaddl.u32 q1, q15
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {pc}
+ ENDP
+
+;==============================
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15), and in the first call of
+;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0, which
+;means the filter is either bypassed or its coefficients are {64, 64}. This simplified routine only
+;works in that situation.
+;note: the case where both xoffset and yoffset are zero also occurs; it can be handled in C code.
+
+|vp8_sub_pixel_variance16x16s_neon| PROC
+ push {r4, lr}
+
+ ldr r4, [sp, #8] ;load *dst_ptr from stack
+ ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #16] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq secondpass_bfilter16x16s_only
+
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ beq firstpass_bfilter16x16s_only
+
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ mov r3, sp
+ mov r2, #4 ;loop counter
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
+ vext.8 q5, q4, q5, #1
+ vext.8 q7, q6, q7, #1
+ vext.8 q9, q8, q9, #1
+
+ vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+
+ vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q1, q1, q2
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q3, q3, q4
+
+ subs r2, r2, #1
+ vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
+ vmov q0, q4
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+
+ bne vp8e_filt_blk2d_fp16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+ mov r2, #2 ;loop counter
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+ mov r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+ vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
+ vld1.u8 {d4, d5, d6, d7}, [r0], r1
+ vld1.u8 {d8, d9, d10, d11}, [r0], r1
+ vld1.u8 {d12, d13, d14, d15}, [r0], r1
+
+ ;pld [r0]
+ ;pld [r0, r1]
+ ;pld [r0, r1, lsl #1]
+
+ vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
+ vld1.u8 {d16, d17, d18, d19}, [r0], r1
+ vext.8 q3, q2, q3, #1
+ vld1.u8 {d20, d21, d22, d23}, [r0], r1
+ vext.8 q5, q4, q5, #1
+ vld1.u8 {d24, d25, d26, d27}, [r0], r1
+ vext.8 q7, q6, q7, #1
+ vld1.u8 {d28, d29, d30, d31}, [r0], r1
+ vext.8 q9, q8, q9, #1
+ vext.8 q11, q10, q11, #1
+ vext.8 q13, q12, q13, #1
+ vext.8 q15, q14, q15, #1
+
+ vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+ vrhadd.u8 q1, q2, q3
+ vrhadd.u8 q2, q4, q5
+ vrhadd.u8 q3, q6, q7
+ vrhadd.u8 q4, q8, q9
+ vrhadd.u8 q5, q10, q11
+ vrhadd.u8 q6, q12, q13
+ vrhadd.u8 q7, q14, q15
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]!
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+ sub sp, sp, #256 ;reserve space on stack for temporary storage
+
+ mov r2, #2 ;loop counter
+ vld1.u8 {d0, d1}, [r0], r1 ;load src data
+ mov r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+ vld1.u8 {d2, d3}, [r0], r1
+ vld1.u8 {d4, d5}, [r0], r1
+ vld1.u8 {d6, d7}, [r0], r1
+ vld1.u8 {d8, d9}, [r0], r1
+
+ vrhadd.u8 q0, q0, q1
+ vld1.u8 {d10, d11}, [r0], r1
+ vrhadd.u8 q1, q1, q2
+ vld1.u8 {d12, d13}, [r0], r1
+ vrhadd.u8 q2, q2, q3
+ vld1.u8 {d14, d15}, [r0], r1
+ vrhadd.u8 q3, q3, q4
+ vld1.u8 {d16, d17}, [r0], r1
+ vrhadd.u8 q4, q4, q5
+ vrhadd.u8 q5, q5, q6
+ vrhadd.u8 q6, q6, q7
+ vrhadd.u8 q7, q7, q8
+
+ subs r2, r2, #1
+
+ vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
+ vmov q0, q8
+ vst1.u8 {d4, d5, d6, d7}, [r3]!
+ vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
+ vst1.u8 {d12, d13, d14, d15}, [r3]!
+
+ bne vp8e_filt_blk2d_spo16x16s_loop_neon
+
+ b sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ sub r3, r3, #256
+ mov r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+ vld1.8 {q0}, [r3]! ;Load up source and reference
+ vld1.8 {q1}, [r4], r12
+ vld1.8 {q2}, [r3]!
+ vld1.8 {q3}, [r4], r12
+ vld1.8 {q4}, [r3]!
+ vld1.8 {q5}, [r4], r12
+ vld1.8 {q6}, [r3]!
+ vld1.8 {q7}, [r4], r12
+
+ vsubl.u8 q11, d0, d2 ;diff
+ vsubl.u8 q12, d1, d3
+ vsubl.u8 q13, d4, d6
+ vsubl.u8 q14, d5, d7
+ vsubl.u8 q0, d8, d10
+ vsubl.u8 q1, d9, d11
+ vsubl.u8 q2, d12, d14
+ vsubl.u8 q3, d13, d15
+
+ vpadal.s16 q8, q11 ;sum
+ vmlal.s16 q9, d22, d22 ;sse
+ vmlal.s16 q10, d23, d23
+
+ subs r2, r2, #1
+
+ vpadal.s16 q8, q12
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vpadal.s16 q8, q13
+ vmlal.s16 q9, d26, d26
+ vmlal.s16 q10, d27, d27
+ vpadal.s16 q8, q14
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ vpadal.s16 q8, q0 ;sum
+ vmlal.s16 q9, d0, d0 ;sse
+ vmlal.s16 q10, d1, d1
+ vpadal.s16 q8, q1
+ vmlal.s16 q9, d2, d2
+ vmlal.s16 q10, d3, d3
+ vpadal.s16 q8, q2
+ vmlal.s16 q9, d4, d4
+ vmlal.s16 q10, d5, d5
+ vpadal.s16 q8, q3
+ vmlal.s16 q9, d6, d6
+ vmlal.s16 q10, d7, d7
+
+ bne sub_pixel_variance16x16s_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #8
+ vsub.u32 d0, d1, d10
+
+ add sp, sp, #256
+ vmov.32 r0, d0[0] ;return
+
+ pop {r4, pc}
+ ENDP
+
+ END
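All three half-pixel variants above sidestep the multiply-based filter: an offset of exactly 4 selects the {64, 64} tap pair, so bilinear filtering degenerates to a rounded average of two neighbouring pixels, which vrhadd.u8 computes per byte lane in a single instruction. In scalar terms (an illustrative sketch, not a libvpx function):

    /* What vrhadd.u8 does on each unsigned byte lane. */
    unsigned char rounded_avg(unsigned char a, unsigned char b)
    {
        return (unsigned char)(((unsigned int)a + b + 1) >> 1);
    }

The hv variant chains two such averages, first across horizontal pixel pairs and then across consecutive averaged rows, carrying the last averaged row into the next loop iteration via vmov.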
diff --git a/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
new file mode 100644
index 0000000..f6b6847
--- /dev/null
+++ b/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_sub_pixel_variance8x8_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
+
+|vp8_sub_pixel_variance8x8_neon| PROC
+ push {r4-r5, lr}
+
+ adr r12, bilinear_taps_coeff
+ ldr r4, [sp, #12] ;load *dst_ptr from stack
+ ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
+ ldr lr, [sp, #20] ;load *sse from stack
+
+ cmp r2, #0 ;skip first_pass filter if xoffset=0
+ beq skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+ add r2, r12, r2, lsl #3 ;calculate filter location
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vld1.u32 {d31}, [r2] ;load first_pass filter
+ vld1.u8 {q2}, [r0], r1
+ vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
+ vld1.u8 {q3}, [r0], r1
+ vdup.8 d1, d31[4]
+ vld1.u8 {q4}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+
+ vld1.u8 {q1}, [r0], r1 ;load src data
+ vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
+ vld1.u8 {q2}, [r0], r1
+ vqrshrn.u16 d23, q7, #7
+ vld1.u8 {q3}, [r0], r1
+ vqrshrn.u16 d24, q8, #7
+ vld1.u8 {q4}, [r0], r1
+ vqrshrn.u16 d25, q9, #7
+
+ ;first_pass filtering on the remaining 5 lines of data
+ vld1.u8 {q5}, [r0], r1
+
+ vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q7, d4, d0
+ vmull.u8 q8, d6, d0
+ vmull.u8 q9, d8, d0
+ vmull.u8 q10, d10, d0
+
+ vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vext.8 d9, d8, d9, #1
+ vext.8 d11, d10, d11, #1
+
+ vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
+ vmlal.u8 q7, d5, d1
+ vmlal.u8 q8, d7, d1
+ vmlal.u8 q9, d9, d1
+ vmlal.u8 q10, d11, d1
+
+ vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d27, q7, #7
+ vqrshrn.u16 d28, q8, #7
+ vqrshrn.u16 d29, q9, #7
+ vqrshrn.u16 d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+ cmp r3, #0 ;skip second_pass filter if yoffset=0
+ ;skip_secondpass_filter
+ beq sub_pixel_variance8x8_neon
+
+ add r3, r12, r3, lsl #3
+
+ vld1.u32 {d31}, [r3] ;load second_pass filter
+
+ vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
+ vdup.8 d1, d31[4]
+
+ vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
+ vmull.u8 q2, d23, d0
+ vmull.u8 q3, d24, d0
+ vmull.u8 q4, d25, d0
+ vmull.u8 q5, d26, d0
+ vmull.u8 q6, d27, d0
+ vmull.u8 q7, d28, d0
+ vmull.u8 q8, d29, d0
+
+ vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
+ vmlal.u8 q2, d24, d1
+ vmlal.u8 q3, d25, d1
+ vmlal.u8 q4, d26, d1
+ vmlal.u8 q5, d27, d1
+ vmlal.u8 q6, d28, d1
+ vmlal.u8 q7, d29, d1
+ vmlal.u8 q8, d30, d1
+
+ vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
+ vqrshrn.u16 d23, q2, #7
+ vqrshrn.u16 d24, q3, #7
+ vqrshrn.u16 d25, q4, #7
+ vqrshrn.u16 d26, q5, #7
+ vqrshrn.u16 d27, q6, #7
+ vqrshrn.u16 d28, q7, #7
+ vqrshrn.u16 d29, q8, #7
+
+ b sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+ vld1.u8 {d22}, [r0], r1 ;load src data
+ vld1.u8 {d23}, [r0], r1
+ vld1.u8 {d24}, [r0], r1
+ vld1.u8 {d25}, [r0], r1
+ vld1.u8 {d26}, [r0], r1
+ vld1.u8 {d27}, [r0], r1
+ vld1.u8 {d28}, [r0], r1
+ vld1.u8 {d29}, [r0], r1
+ vld1.u8 {d30}, [r0], r1
+
+ b secondpass_filter
+
+;----------------------
+;vp8_variance8x8_neon
+sub_pixel_variance8x8_neon
+ vmov.i8 q8, #0 ;q8 - sum
+ vmov.i8 q9, #0 ;q9, q10 - sse
+ vmov.i8 q10, #0
+
+ mov r12, #2
+
+sub_pixel_variance8x8_neon_loop
+ vld1.8 {d0}, [r4], r5 ;load dst data
+ subs r12, r12, #1
+ vld1.8 {d1}, [r4], r5
+ vld1.8 {d2}, [r4], r5
+ vsubl.u8 q4, d22, d0 ;calculate diff
+ vld1.8 {d3}, [r4], r5
+
+ vsubl.u8 q5, d23, d1
+ vsubl.u8 q6, d24, d2
+
+ vpadal.s16 q8, q4 ;sum
+ vmlal.s16 q9, d8, d8 ;sse
+ vmlal.s16 q10, d9, d9
+
+ vsubl.u8 q7, d25, d3
+
+ vpadal.s16 q8, q5
+ vmlal.s16 q9, d10, d10
+ vmlal.s16 q10, d11, d11
+
+ vmov q11, q13
+
+ vpadal.s16 q8, q6
+ vmlal.s16 q9, d12, d12
+ vmlal.s16 q10, d13, d13
+
+ vmov q12, q14
+
+ vpadal.s16 q8, q7
+ vmlal.s16 q9, d14, d14
+ vmlal.s16 q10, d15, d15
+
+ bne sub_pixel_variance8x8_neon_loop
+
+ vadd.u32 q10, q9, q10 ;accumulate sse
+ vpaddl.s32 q0, q8 ;accumulate sum
+
+ vpaddl.u32 q1, q10
+ vadd.s64 d0, d0, d1
+ vadd.u64 d1, d2, d3
+
+ vmull.s32 q5, d0, d0
+ vst1.32 {d1[0]}, [lr] ;store sse
+ vshr.u32 d10, d10, #6
+ vsub.u32 d0, d1, d10
+
+ vmov.32 r0, d0[0] ;return
+ pop {r4-r5, pc}
+
+ ENDP
+
+;-----------------
+
+bilinear_taps_coeff
+ DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+ END
diff --git a/libvpx/vp8/common/arm/reconintra_arm.c b/libvpx/vp8/common/arm/reconintra_arm.c
new file mode 100644
index 0000000..121e090
--- /dev/null
+++ b/libvpx/vp8/common/arm/reconintra_arm.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_NEON
+extern void vp8_build_intra_predictors_mby_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
+{
+ unsigned char *y_buffer = x->dst.y_buffer;
+ unsigned char *ypred_ptr = x->predictor;
+ int y_stride = x->dst.y_stride;
+ int mode = x->mode_info_context->mbmi.mode;
+ int Up = x->up_available;
+ int Left = x->left_available;
+
+ vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
+}
+
+extern void vp8_build_intra_predictors_mby_s_neon_func(
+ unsigned char *y_buffer,
+ unsigned char *ypred_ptr,
+ int y_stride,
+ int mode,
+ int Up,
+ int Left);
+
+void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
+{
+ unsigned char *y_buffer = x->dst.y_buffer;
+ unsigned char *ypred_ptr = x->predictor;
+ int y_stride = x->dst.y_stride;
+ int mode = x->mode_info_context->mbmi.mode;
+ int Up = x->up_available;
+ int Left = x->left_available;
+
+ vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
+}
+
+#endif
diff --git a/libvpx/vp8/common/arm/variance_arm.c b/libvpx/vp8/common/arm/variance_arm.c
new file mode 100644
index 0000000..891d767
--- /dev/null
+++ b/libvpx/vp8/common/arm/variance_arm.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/filter.h"
+
+#if HAVE_MEDIA
+#include "vp8/common/arm/bilinearfilter_arm.h"
+
+unsigned int vp8_sub_pixel_variance8x8_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned short first_pass[10*8];
+ unsigned char second_pass[8*8];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 9, 8, HFilter);
+ vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 8, 8, 8, VFilter);
+
+ return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+ dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned short first_pass[36*16];
+ unsigned char second_pass[20*16];
+ const short *HFilter, *VFilter;
+ unsigned int var;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, sse);
+ }
+ else
+ {
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+ src_pixels_per_line,
+ 17, 16, HFilter);
+ vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+ 16, 16, 16, VFilter);
+
+ var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+ dst_pixels_per_line, sse);
+ }
+ return var;
+}
+
+#endif /* HAVE_MEDIA */
+
+
+#if HAVE_NEON
+
+extern unsigned int vp8_sub_pixel_variance16x16_neon_func
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+);
+
+unsigned int vp8_sub_pixel_variance16x16_neon
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ if (xoffset == 4 && yoffset == 0)
+ return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else if (xoffset == 0 && yoffset == 4)
+ return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else if (xoffset == 4 && yoffset == 4)
+ return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
+ else
+ return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+}
+
+#endif
diff --git a/libvpx/vp8/common/asm_com_offsets.c b/libvpx/vp8/common/asm_com_offsets.c
new file mode 100644
index 0000000..ae22b5f
--- /dev/null
+++ b/libvpx/vp8/common/asm_com_offsets.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_scale/yv12config.h"
+#include "vp8/common/blockd.h"
+
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif /* CONFIG_POSTPROC */
+
+BEGIN
+
+/* vpx_scale */
+DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
+DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
+
+#if CONFIG_POSTPROC
+/* mfqe.c / filter_by_weight */
+DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION);
+#endif /* CONFIG_POSTPROC */
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
+
+#if HAVE_MEDIA
+/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */
+ct_assert(B_DC_PRED, B_DC_PRED == 0);
+ct_assert(B_TM_PRED, B_TM_PRED == 1);
+ct_assert(B_VE_PRED, B_VE_PRED == 2);
+ct_assert(B_HE_PRED, B_HE_PRED == 3);
+ct_assert(B_LD_PRED, B_LD_PRED == 4);
+ct_assert(B_RD_PRED, B_RD_PRED == 5);
+ct_assert(B_VR_PRED, B_VR_PRED == 6);
+ct_assert(B_VL_PRED, B_VL_PRED == 7);
+ct_assert(B_HD_PRED, B_HD_PRED == 8);
+ct_assert(B_HU_PRED, B_HU_PRED == 9);
+#endif
+
+#if HAVE_NEON
+/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
+ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
+#endif
+
+#if HAVE_SSE2
+#if CONFIG_POSTPROC
+/* vp8_filter_by_weight16x16 and 8x8 */
+ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
+#endif /* CONFIG_POSTPROC */
+#endif /* HAVE_SSE2 */
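This file is compiled but never linked: the BEGIN/DEFINE/END and ct_assert macros from vpx_ports/asm_offsets.h turn it into a vehicle for exporting structure offsets to the hand-written assembly and for compile-time checks of constants that the assembly hard-codes. The exact macro definitions live in asm_offsets.h; one common shape for this technique (an illustrative sketch, not vpx's literal macros) is:

    /* Sketch of the offsets-export idiom: build this translation unit,
     * run it (or scrape its generated assembly), and feed the named
     * constants to the assembler. */
    #include <stdio.h>
    #define BEGIN int main(void) {
    #define DEFINE(sym, val) printf(#sym " EQU %d\n", (int)(val))
    #define END return 0; }

    /* ct_assert-style compile-time check: the build fails when !cond,
     * because the array would have negative size. */
    #define ct_assert(name, cond) typedef char assert_##name[(cond) ? 1 : -1];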
diff --git a/libvpx/vp8/common/blockd.c b/libvpx/vp8/common/blockd.c
new file mode 100644
index 0000000..1fc3cd0
--- /dev/null
+++ b/libvpx/vp8/common/blockd.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+const unsigned char vp8_block2left[25] =
+{
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+const unsigned char vp8_block2above[25] =
+{
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+};
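These two 25-entry tables map a block index within a macroblock (blocks 0-15 are the 4x4 Y blocks, 16-19 the U blocks, 20-23 the V blocks, 24 the Y2 second-order block) to its row in the left entropy context and its column in the above entropy context. A hypothetical generator that reproduces both tables (illustrative only, not part of libvpx):

    #include <stdio.h>

    int main(void)
    {
        int b;
        for (b = 0; b < 25; b++)
        {
            int left, above;
            if (b < 16)      { left = b >> 2;              above = b & 3; }
            else if (b < 20) { left = 4 + ((b - 16) >> 1); above = 4 + ((b - 16) & 1); }
            else if (b < 24) { left = 6 + ((b - 20) >> 1); above = 6 + ((b - 20) & 1); }
            else             { left = 8;                   above = 8; }
            printf("%d: left=%d above=%d\n", b, left, above);
        }
        return 0;
    }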
diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h
new file mode 100644
index 0000000..f7ff577
--- /dev/null
+++ b/libvpx/vp8/common/blockd.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCKD_H
+#define __INC_BLOCKD_H
+
+void vpx_log(const char *format, ...);
+
+#include "vpx_config.h"
+#include "vpx_scale/yv12config.h"
+#include "mv.h"
+#include "treecoder.h"
+#include "vpx_ports/mem.h"
+
+/*#define DCPRED 1*/
+#define DCPREDSIMTHRESH 0
+#define DCPREDCNTTHRESH 3
+
+#define MB_FEATURE_TREE_PROBS 3
+#define MAX_MB_SEGMENTS 4
+
+#define MAX_REF_LF_DELTAS 4
+#define MAX_MODE_LF_DELTAS 4
+
+/* Segment Feature Masks */
+#define SEGMENT_DELTADATA 0
+#define SEGMENT_ABSDATA 1
+
+typedef struct
+{
+ int r, c;
+} POS;
+
+#define PLANE_TYPE_Y_NO_DC 0
+#define PLANE_TYPE_Y2 1
+#define PLANE_TYPE_UV 2
+#define PLANE_TYPE_Y_WITH_DC 3
+
+
+typedef char ENTROPY_CONTEXT;
+typedef struct
+{
+ ENTROPY_CONTEXT y1[4];
+ ENTROPY_CONTEXT u[2];
+ ENTROPY_CONTEXT v[2];
+ ENTROPY_CONTEXT y2;
+} ENTROPY_CONTEXT_PLANES;
+
+extern const unsigned char vp8_block2left[25];
+extern const unsigned char vp8_block2above[25];
+
+#define VP8_COMBINEENTROPYCONTEXTS( Dest, A, B) \
+ Dest = (A)+(B);
+
+
+typedef enum
+{
+ KEY_FRAME = 0,
+ INTER_FRAME = 1
+} FRAME_TYPE;
+
+typedef enum
+{
+ DC_PRED, /* average of above and left pixels */
+ V_PRED, /* vertical prediction */
+ H_PRED, /* horizontal prediction */
+ TM_PRED, /* TrueMotion prediction */
+ B_PRED, /* block based prediction, each block has its own prediction mode */
+
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+ SPLITMV,
+
+ MB_MODE_COUNT
+} MB_PREDICTION_MODE;
+
+/* Macroblock level features */
+typedef enum
+{
+ MB_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */
+ MB_LVL_ALT_LF = 1, /* Use alternate loop filter value... */
+ MB_LVL_MAX = 2 /* Number of MB level features supported */
+
+} MB_LVL_FEATURES;
+
+/* Segment Feature Masks */
+#define SEGMENT_ALTQ 0x01
+#define SEGMENT_ALT_LF 0x02
+
+#define VP8_YMODES (B_PRED + 1)
+#define VP8_UV_MODES (TM_PRED + 1)
+
+#define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
+
+typedef enum
+{
+ B_DC_PRED, /* average of above and left pixels */
+ B_TM_PRED,
+
+ B_VE_PRED, /* vertical prediction */
+ B_HE_PRED, /* horizontal prediction */
+
+ B_LD_PRED,
+ B_RD_PRED,
+
+ B_VR_PRED,
+ B_VL_PRED,
+ B_HD_PRED,
+ B_HU_PRED,
+
+ LEFT4X4,
+ ABOVE4X4,
+ ZERO4X4,
+ NEW4X4,
+
+ B_MODE_COUNT
+} B_PREDICTION_MODE;
+
+#define VP8_BINTRAMODES (B_HU_PRED + 1) /* 10 */
+#define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+union b_mode_info
+{
+ B_PREDICTION_MODE as_mode;
+ int_mv mv;
+};
+
+typedef enum
+{
+ INTRA_FRAME = 0,
+ LAST_FRAME = 1,
+ GOLDEN_FRAME = 2,
+ ALTREF_FRAME = 3,
+ MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct
+{
+ uint8_t mode, uv_mode;
+ uint8_t ref_frame;
+ uint8_t is_4x4;
+ int_mv mv;
+
+ uint8_t partitioning;
+ uint8_t mb_skip_coeff; /* does this mb have coefficients at all? 1=no coefficients, 0=tokens need to be decoded */
+ uint8_t need_to_clamp_mvs;
+ uint8_t segment_id; /* Which set of segmentation parameters should be used for this MB */
+} MB_MODE_INFO;
+
+typedef struct modeinfo
+{
+ MB_MODE_INFO mbmi;
+ union b_mode_info bmi[16];
+} MODE_INFO;
+
+#if CONFIG_MULTI_RES_ENCODING
+/* The mb-level information that needs to be stored for the higher-resolution encoder */
+typedef struct
+{
+ MB_PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame;
+ int_mv mv;
+ int dissim; /* dissimilarity level of the macroblock */
+} LOWER_RES_MB_INFO;
+
+/* The frame-level information that needs to be stored for the
+ * higher-resolution encoder */
+typedef struct
+{
+ FRAME_TYPE frame_type;
+ int is_frame_dropped;
+ /* The frame number of each reference frame */
+ unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+ LOWER_RES_MB_INFO *mb_info;
+} LOWER_RES_FRAME_INFO;
+#endif
+
+typedef struct blockd
+{
+ short *qcoeff;
+ short *dqcoeff;
+ unsigned char *predictor;
+ short *dequant;
+
+ int offset;
+ char *eob;
+
+ union b_mode_info bmi;
+} BLOCKD;
+
+typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+typedef struct macroblockd
+{
+ DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED(16, short, qcoeff[400]);
+ DECLARE_ALIGNED(16, short, dqcoeff[400]);
+ DECLARE_ALIGNED(16, char, eobs[25]);
+
+ DECLARE_ALIGNED(16, short, dequant_y1[16]);
+ DECLARE_ALIGNED(16, short, dequant_y1_dc[16]);
+ DECLARE_ALIGNED(16, short, dequant_y2[16]);
+ DECLARE_ALIGNED(16, short, dequant_uv[16]);
+
+ /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+ BLOCKD block[25];
+ int fullpixel_mask;
+
+ YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+ YV12_BUFFER_CONFIG dst;
+
+ MODE_INFO *mode_info_context;
+ int mode_info_stride;
+
+ FRAME_TYPE frame_type;
+
+ int up_available;
+ int left_available;
+
+ unsigned char *recon_above[3];
+ unsigned char *recon_left[3];
+ int recon_left_stride[2];
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context;
+ ENTROPY_CONTEXT_PLANES *left_context;
+
+ /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+ unsigned char segmentation_enabled;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+ unsigned char update_mb_segmentation_map;
+
+ /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+ unsigned char update_mb_segmentation_data;
+
+ /* 0 (SEGMENT_DELTADATA) or 1 (SEGMENT_ABSDATA): how the segment feature data is interpreted. */
+ unsigned char mb_segement_abs_delta;
+
+ /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+ /* are enabled, and when enabled, the probabilities used to decode the per MB flags in MB_MODE_INFO */
+ vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */
+
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */
+
+ /* mode_based Loop filter adjustment */
+ unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;
+
+ /* Delta values have the range +/- MAX_LOOP_FILTER */
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+ signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+ /* Distance of MB away from frame edges */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+
+
+ vp8_subpix_fn_t subpixel_predict;
+ vp8_subpix_fn_t subpixel_predict8x4;
+ vp8_subpix_fn_t subpixel_predict8x8;
+ vp8_subpix_fn_t subpixel_predict16x16;
+
+ void *current_bc;
+
+ int corrupted;
+
+#if ARCH_X86 || ARCH_X86_64
+ /* This is an intermediate buffer currently used in sub-pixel motion search
+ * to keep a copy of the reference area. This buffer can be used for other
+ * purposes.
+ */
+ DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
+#endif
+} MACROBLOCKD;
+
+
+extern void vp8_build_block_doffsets(MACROBLOCKD *x);
+extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
+
+#endif /* __INC_BLOCKD_H */
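vp8_setup_block_dptrs() (declared above, defined elsewhere in common/) wires the 25 BLOCKD entries so that each views a slice of the flat per-macroblock arrays in MACROBLOCKD. A hedged sketch of the coefficient wiring, under the assumption of 16 coefficients per sub-block; predictor pointers are set up similarly and this is not the exact upstream body:

    /* Illustrative sketch: each of the 25 sub-blocks aliases a
     * 16-coefficient slice of qcoeff/dqcoeff and one eob entry. */
    static void setup_block_dptrs_sketch(MACROBLOCKD *x)
    {
        int i;
        for (i = 0; i < 25; i++)
        {
            x->block[i].qcoeff  = x->qcoeff  + i * 16;
            x->block[i].dqcoeff = x->dqcoeff + i * 16;
            x->block[i].eob     = x->eobs + i;
        }
    }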
diff --git a/libvpx/vp8/common/coefupdateprobs.h b/libvpx/vp8/common/coefupdateprobs.h
new file mode 100644
index 0000000..9e194dc
--- /dev/null
+++ b/libvpx/vp8/common/coefupdateprobs.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+ Generated file included by entropy.c */
+
+const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
diff --git a/libvpx/vp8/common/common.h b/libvpx/vp8/common/common.h
new file mode 100644
index 0000000..2cc1c54
--- /dev/null
+++ b/libvpx/vp8/common/common.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+/* Only needed for fixed-size arrays; for structs, just assign. */
+
+#define vp8_copy( Dest, Src) { \
+ assert( sizeof( Dest) == sizeof( Src)); \
+ vpx_memcpy( Dest, Src, sizeof( Src)); \
+ }
+
+/* Use this for variably-sized arrays. */
+
+#define vp8_copy_array( Dest, Src, N) { \
+ assert( sizeof( *Dest) == sizeof( *Src)); \
+ vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+ }
+
+#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+
+#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+
+
+#endif /* common_h */
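The copy/zero macros above take the destination by name so that sizeof() sees the full array type, which lets the assert catch size mismatches at run time. A minimal usage sketch; the array names here are made up for illustration:

    /* Illustrative usage of vp8_copy / vp8_zero. */
    unsigned char probs[4][8];
    const unsigned char defaults[4][8] = { { 0 } };

    void reset_probs(void)
    {
        vp8_copy(probs, defaults); /* sizes must match; checked by assert */
        /* vp8_zero(probs); would memset the whole array instead */
    }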
diff --git a/libvpx/vp8/common/context.c b/libvpx/vp8/common/context.c
new file mode 100644
index 0000000..99e95d3
--- /dev/null
+++ b/libvpx/vp8/common/context.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+/* *** GENERATED FILE: DO NOT EDIT *** */
+
+#if 0
+int Contexts[vp8_coef_counter_dimen];
+
+const int default_contexts[vp8_coef_counter_dimen] =
+{
+ {
+ // Block Type ( 0 )
+ {
+ // Coeff Band ( 0 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ // Block Type ( 1 )
+ {
+ // Coeff Band ( 0 )
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ // Block Type ( 2 )
+ {
+ // Coeff Band ( 0 )
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ // Coeff Band ( 2 )
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ // Coeff Band ( 3 )
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ // Coeff Band ( 4 )
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ // Coeff Band ( 5 )
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ // Coeff Band ( 6 )
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ // Block Type ( 3 )
+ {
+ // Coeff Band ( 0 )
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ // Coeff Band ( 1 )
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ // Coeff Band ( 2 )
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ // Coeff Band ( 3 )
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ // Coeff Band ( 4 )
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ // Coeff Band ( 5 )
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ // Coeff Band ( 6 )
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ // Coeff Band ( 7 )
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
+
+// Update probabilities for the nodes in the token entropy tree.
+const vp8_prob tree_update_probs[vp8_coef_tree_dimen] =
+{
+ {
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, },
+ {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+ {
+ {
+ {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ {
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, },
+ },
+ },
+};
+#endif
diff --git a/libvpx/vp8/common/debugmodes.c b/libvpx/vp8/common/debugmodes.c
new file mode 100644
index 0000000..46064e6
--- /dev/null
+++ b/libvpx/vp8/common/debugmodes.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include "blockd.h"
+
+
+void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int frame)
+{
+
+ int mb_row;
+ int mb_col;
+ int mb_index = 0;
+ FILE *mvs = fopen("mvs.stt", "a");
+
+ /* nothing to do if the stats file could not be opened */
+ if (!mvs)
+ return;
+
+ /* print out the macroblock Y modes */
+ mb_index = 0;
+ fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
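+ /* the mode info array has one extra border column (stride is cols + 1),
+ so skip one entry at the end of each row */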
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ mb_index = 0;
+ fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+ mb_index++;
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock UV modes */
+ mb_index = 0;
+ fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+
+ fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+ /* print out the block modes */
+ mb_index = 0;
+ fprintf(mvs, "Mbs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+
+ if (mi[mb_index].mbmi.mode == B_PRED)
+ fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
+ else
+ fprintf(mvs, "xx ");
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+ /* print out the macroblock mvs */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+
+ for (mb_row = 0; mb_row < rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cols; mb_col++)
+ {
+ fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv.as_mv.row / 2, mi[mb_index].mbmi.mv.as_mv.col / 2);
+
+ mb_index++;
+ }
+
+ mb_index++;
+ fprintf(mvs, "\n");
+ }
+
+ fprintf(mvs, "\n");
+
+
+ /* print out the block motion vectors */
+ mb_index = 0;
+ fprintf(mvs, "MVs for Frame %d\n", frame);
+ {
+ int b_row;
+
+ for (b_row = 0; b_row < 4 * rows; b_row++)
+ {
+ int b_col;
+ int bindex;
+
+ for (b_col = 0; b_col < 4 * cols; b_col++)
+ {
+ mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+ bindex = (b_row & 3) * 4 + (b_col & 3);
+ fprintf(mvs, "%3d:%-3d ", mi[mb_index].bmi[bindex].mv.as_mv.row, mi[mb_index].bmi[bindex].mv.as_mv.col);
+
+ }
+
+ fprintf(mvs, "\n");
+ }
+ }
+ fprintf(mvs, "\n");
+
+
+ fclose(mvs);
+}
diff --git a/libvpx/vp8/common/default_coef_probs.h b/libvpx/vp8/common/default_coef_probs.h
new file mode 100644
index 0000000..0d19563
--- /dev/null
+++ b/libvpx/vp8/common/default_coef_probs.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/* Generated file, included by entropy.c */
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [ENTROPY_NODES] =
+{
+ { /* Block Type ( 0 ) */
+ { /* Coeff Band ( 0 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+ { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+ { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+ { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+ { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+ { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+ { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+ { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+ { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+ { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+ { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+ { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+ { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 1 ) */
+ { /* Coeff Band ( 0 )*/
+ { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+ { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+ { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+ { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+ { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+ { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+ { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+ { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+ { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+ { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+ { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+ { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+ { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+ { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+ { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 2 ) */
+ { /* Coeff Band ( 0 )*/
+ { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+ { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+ { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+ { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+ { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+ { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+ { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+ { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+ { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+ { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ },
+ { /* Block Type ( 3 ) */
+ { /* Coeff Band ( 0 )*/
+ { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+ { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+ { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+ },
+ { /* Coeff Band ( 1 )*/
+ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+ { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+ { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 2 )*/
+ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+ { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+ { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 3 )*/
+ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+ { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+ { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+ },
+ { /* Coeff Band ( 4 )*/
+ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+ { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+ { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 5 )*/
+ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+ { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+ { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 6 )*/
+ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+ { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+ { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+ },
+ { /* Coeff Band ( 7 )*/
+ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+ { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+ }
+ }
+};
diff --git a/libvpx/vp8/common/dequantize.c b/libvpx/vp8/common/dequantize.c
new file mode 100644
index 0000000..8eda486
--- /dev/null
+++ b/libvpx/vp8/common/dequantize.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequantize_b_c(BLOCKD *d, short *DQC)
+{
+ int i;
+ short *DQ = d->dqcoeff;
+ short *Q = d->qcoeff;
+
+ for (i = 0; i < 16; i++)
+ {
+ DQ[i] = Q[i] * DQC[i];
+ }
+}
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
+
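+ /* clear all 16 short coefficients (32 bytes) ready for the next block */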
+ vpx_memset(input, 0, 32);
+
+}
diff --git a/libvpx/vp8/common/entropy.c b/libvpx/vp8/common/entropy.c
new file mode 100644
index 0000000..8c046a4
--- /dev/null
+++ b/libvpx/vp8/common/entropy.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "entropy.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "coefupdateprobs.h"
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
+{
+ 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
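+
+/* Illustrative note (not part of the original source): vp8_norm[v] is the
+ left-shift that renormalises a bool-decoder range v (1..255) back into
+ the interval 128..255, i.e. the number of leading zero bits of v as an
+ 8-bit value. A minimal sketch of how the decoder consumes it:
+
+ shift = vp8_norm[range];
+ range <<= shift;
+ value <<= shift;
+*/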
+
+DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) =
+{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
+
+DECLARE_ALIGNED(16, const unsigned char,
+ vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
+{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
+
+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
+{
+ 0, 1, 4, 8,
+ 5, 2, 3, 6,
+ 9, 12, 13, 10,
+ 7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) =
+{
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
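+
+/* Illustrative note (not part of the original source): the two tables are
+ inverses of one another up to a 1-based offset, i.e. for every scan
+ position i,
+
+ vp8_default_inv_zig_zag[vp8_default_zig_zag1d[i]] == i + 1
+*/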
+
+/* vp8_default_zig_zag_mask generated with:
+
+ void vp8_init_scan_order_mask()
+ {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;
+ }
+
+ }
+*/
+DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]) =
+{
+ 1, 2, 32, 64,
+ 4, 16, 128, 4096,
+ 8, 256, 2048, 8192,
+ 512, 1024, 16384, -32768
+};
+
+const int vp8_mb_feature_data_bits[MB_LVL_MAX] = {7, 6};
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
+{
+ -DCT_EOB_TOKEN, 2, /* 0 = EOB */
+ -ZERO_TOKEN, 4, /* 1 = ZERO */
+ -ONE_TOKEN, 6, /* 2 = ONE */
+ 8, 12, /* 3 = LOW_VAL */
+ -TWO_TOKEN, 10, /* 4 = TWO */
+ -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
+ 14, 16, /* 6 = HIGH_LOW */
+ -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
+ 18, 20, /* 8 = CAT_THREEFOUR */
+ -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
+ -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
+};
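+
+/* Illustrative sketch (not part of the original source): a vp8_tree is
+ walked by reading one bool per internal node. Negative entries are leaf
+ tokens, positive entries index the next child pair, and the probability
+ for node i lives at p[i >> 1]. Assuming the codec's bool-decoder
+ primitive vp8_read(r, prob):
+
+ static int read_coef_token(vp8_reader *r, const vp8_prob *p)
+ {
+ vp8_tree_index i = 0;
+
+ while ((i = vp8_coef_tree[i + vp8_read(r, p[i >> 1])]) > 0)
+ continue;
+
+ return -i;
+ }
+*/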
+
+/* vp8_coef_encodings generated with:
+ vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
+*/
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
+{
+ {2, 2},
+ {6, 3},
+ {28, 5},
+ {58, 6},
+ {59, 6},
+ {60, 6},
+ {61, 6},
+ {124, 7},
+ {125, 7},
+ {126, 7},
+ {127, 7},
+ {0, 1}
+};
+
+/* Trees for extra bits. Probabilities are constant and
+ do not depend on previously encoded bits */
+
+static const vp8_prob Pcat1[] = { 159};
+static const vp8_prob Pcat2[] = { 165, 145};
+static const vp8_prob Pcat3[] = { 173, 148, 140};
+static const vp8_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp8_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp8_prob Pcat6[] =
+{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+
+
+/* tree index tables generated with:
+
+ void init_bit_tree(vp8_tree_index *p, int n)
+ {
+ int i = 0;
+
+ while (++i < n)
+ {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+ }
+
+ void init_bit_trees()
+ {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 11);
+ }
+*/
+
+static const vp8_tree_index cat1[2] = { 0, 0 };
+static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 };
+static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 };
+static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 };
+static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 };
+static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+ 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 };
+
+const vp8_extra_bit_struct vp8_extra_bits[12] =
+{
+ { 0, 0, 0, 0},
+ { 0, 0, 0, 1},
+ { 0, 0, 0, 2},
+ { 0, 0, 0, 3},
+ { 0, 0, 0, 4},
+ { cat1, Pcat1, 1, 5},
+ { cat2, Pcat2, 2, 7},
+ { cat3, Pcat3, 3, 11},
+ { cat4, Pcat4, 4, 19},
+ { cat5, Pcat5, 5, 35},
+ { cat6, Pcat6, 11, 67},
+ { 0, 0, 0, 0}
+};
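+
+/* Illustrative sketch (not part of the original source): a category token's
+ magnitude is base_val plus its extra bits, read most significant bit
+ first with the Pcat* probabilities. E.g. DCT_VAL_CATEGORY3 (base_val 11,
+ Len 3) covers magnitudes 11..18:
+
+ static int read_extra_bits(vp8_reader *r, const vp8_extra_bit_struct *e)
+ {
+ int i, v = 0;
+
+ for (i = 0; i < e->Len; i++)
+ v = (v << 1) | vp8_read(r, e->prob[i]);
+
+ return e->base_val + v;
+ }
+*/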
+
+#include "default_coef_probs.h"
+
+void vp8_default_coef_probs(VP8_COMMON *pc)
+{
+ vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+ sizeof(default_coef_probs));
+}
+
diff --git a/libvpx/vp8/common/entropy.h b/libvpx/vp8/common/entropy.h
new file mode 100644
index 0000000..5389bc1
--- /dev/null
+++ b/libvpx/vp8/common/entropy.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPY_H
+#define __INC_ENTROPY_H
+
+#include "treecoder.h"
+#include "blockd.h"
+
+/* Coefficient token alphabet */
+
+#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
+#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
+#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
+#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
+#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
+#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
+#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
+#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
+#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
+#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
+#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
+#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
+
+#define MAX_ENTROPY_TOKENS 12
+#define ENTROPY_NODES 11
+
+extern const vp8_tree_index vp8_coef_tree[];
+
+extern const struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
+
+typedef struct
+{
+ vp8_tree_p tree;
+ const vp8_prob *prob;
+ int Len;
+ int base_val;
+} vp8_extra_bit_struct;
+
+extern const vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */
+
+#define PROB_UPDATE_BASELINE_COST 7
+
+#define MAX_PROB 255
+#define DCT_MAX_VALUE 2048
+
+
+/* Coefficients are predicted via a 3-dimensional probability table. */
+
+/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
+
+#define BLOCK_TYPES 4
+
+/* Middle dimension is a coarsening of the coefficient's
+ position within the 4x4 DCT. */
+
+#define COEF_BANDS 8
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
+
+/* Inside dimension is a 3-valued measure of nearby complexity, that is,
+ the extent to which nearby coefficients are nonzero. For the first
+ coefficient (DC, unless block type is 0), we look at the (already encoded)
+ blocks above and to the left of the current block. The context index is
+ then the number (0, 1, or 2) of these blocks having nonzero coefficients.
+ After decoding a coefficient, the measure is roughly the size of the
+ most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).
+ Note that the intuitive meaning of this measure changes as coefficients
+ are decoded: prior to the first token, a zero means that the neighboring
+ blocks are empty, while, after the first token, because of the use of
+ end-of-block, a zero means we just decoded a zero and hence guarantees that
+ a nonzero coefficient will appear later in this block. However, this shift
+ in meaning is perfectly OK because our context depends also on the
+ coefficient band (and zigzag positions 0, 1, and 2 are in distinct
+ bands). */
+
+/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
+# define PREV_COEF_CONTEXTS 3
+
+extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
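+
+/* Illustrative note (not part of the original source): after each decoded
+ token the running context for the next coefficient is simply
+
+ context = vp8_prev_token_class[token];
+
+ i.e. 0 for ZERO_TOKEN, 1 for ONE_TOKEN and 2 for any larger magnitude. */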
+
+extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+
+struct VP8Common;
+void vp8_default_coef_probs(struct VP8Common *);
+
+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]);
+extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
+extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
+
+void vp8_coef_tree_initialize(void);
+#endif
diff --git a/libvpx/vp8/common/entropymode.c b/libvpx/vp8/common/entropymode.c
new file mode 100644
index 0000000..091e4c7
--- /dev/null
+++ b/libvpx/vp8/common/entropymode.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define USE_PREBUILT_TABLES
+
+#include "entropymode.h"
+#include "entropy.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8_entropymodedata.h"
+
+int vp8_mv_cont(const int_mv *l, const int_mv *a)
+{
+ int lez = (l->as_int == 0);
+ int aez = (a->as_int == 0);
+ int lea = (l->as_int == a->as_int);
+
+ if (lea && lez)
+ return SUBMVREF_LEFT_ABOVE_ZED;
+
+ if (lea)
+ return SUBMVREF_LEFT_ABOVE_SAME;
+
+ if (aez)
+ return SUBMVREF_ABOVE_ZED;
+
+ if (lez)
+ return SUBMVREF_LEFT_ZED;
+
+ return SUBMVREF_NORMAL;
+}
+
+static const vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1] = { 180, 162, 25};
+
+const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1] =
+{
+ { 147, 136, 18 },
+ { 106, 145, 1 },
+ { 179, 121, 1 },
+ { 223, 1 , 34 },
+ { 208, 1 , 1 }
+};
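+
+/* Illustrative note (not part of the original source): vp8_mv_cont()
+ classifies the left/above sub-block MVs into one of the SUBMVREF_COUNT
+ contexts, which selects the probability row used with the sub-MV
+ reference tree, roughly:
+
+ prob = vp8_sub_mv_ref_prob2[vp8_mv_cont(&left_mv, &above_mv)];
+*/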
+
+
+
+const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS] =
+{
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 1, 1, 1, 1,
+ 1, 1, 1, 1,
+ },
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ },
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 2, 2, 3, 3,
+ },
+ {
+ 0, 1, 2, 3,
+ 4, 5, 6, 7,
+ 8, 9, 10, 11,
+ 12, 13, 14, 15,
+ }
+};
+
+const int vp8_mbsplit_count [VP8_NUMMBSPLITS] = { 2, 2, 4, 16};
+
+const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1] = { 110, 111, 150};
+
+
+/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+
+const vp8_tree_index vp8_bmode_tree[18] = /* INTRAMODECONTEXTNODE value */
+{
+ -B_DC_PRED, 2, /* 0 = DC_NODE */
+ -B_TM_PRED, 4, /* 1 = TM_NODE */
+ -B_VE_PRED, 6, /* 2 = VE_NODE */
+ 8, 12, /* 3 = COM_NODE */
+ -B_HE_PRED, 10, /* 4 = HE_NODE */
+ -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */
+ -B_LD_PRED, 14, /* 6 = LD_NODE */
+ -B_VL_PRED, 16, /* 7 = VL_NODE */
+ -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */
+};
+
+/* Again, these trees use the same probability indices as their
+ explicitly-programmed predecessors. */
+
+const vp8_tree_index vp8_ymode_tree[8] =
+{
+ -DC_PRED, 2,
+ 4, 6,
+ -V_PRED, -H_PRED,
+ -TM_PRED, -B_PRED
+};
+
+const vp8_tree_index vp8_kf_ymode_tree[8] =
+{
+ -B_PRED, 2,
+ 4, 6,
+ -DC_PRED, -V_PRED,
+ -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_uv_mode_tree[6] =
+{
+ -DC_PRED, 2,
+ -V_PRED, 4,
+ -H_PRED, -TM_PRED
+};
+
+const vp8_tree_index vp8_mbsplit_tree[6] =
+{
+ -3, 2,
+ -2, 4,
+ -0, -1
+};
+
+const vp8_tree_index vp8_mv_ref_tree[8] =
+{
+ -ZEROMV, 2,
+ -NEARESTMV, 4,
+ -NEARMV, 6,
+ -NEWMV, -SPLITMV
+};
+
+const vp8_tree_index vp8_sub_mv_ref_tree[6] =
+{
+ -LEFT4X4, 2,
+ -ABOVE4X4, 4,
+ -ZERO4X4, -NEW4X4
+};
+
+const vp8_tree_index vp8_small_mvtree [14] =
+{
+ 2, 8,
+ 4, 6,
+ -0, -1,
+ -2, -3,
+ 10, 12,
+ -4, -5,
+ -6, -7
+};
+
+void vp8_init_mbmode_probs(VP8_COMMON *x)
+{
+ vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+ vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+ vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+}
+
+void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
+{
+ vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+}
+
diff --git a/libvpx/vp8/common/entropymode.h b/libvpx/vp8/common/entropymode.h
new file mode 100644
index 0000000..1df0f64
--- /dev/null
+++ b/libvpx/vp8/common/entropymode.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMODE_H
+#define __INC_ENTROPYMODE_H
+
+#include "onyxc_int.h"
+#include "treecoder.h"
+
+typedef enum
+{
+ SUBMVREF_NORMAL,
+ SUBMVREF_LEFT_ZED,
+ SUBMVREF_ABOVE_ZED,
+ SUBMVREF_LEFT_ABOVE_SAME,
+ SUBMVREF_LEFT_ABOVE_ZED
+} sumvfref_t;
+
+typedef int vp8_mbsplit[16];
+
+#define VP8_NUMMBSPLITS 4
+
+extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
+
+extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */
+
+extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
+
+extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
+#define SUBMVREF_COUNT 5
+extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];
+
+
+extern const unsigned int vp8_kf_default_bmode_counts [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES];
+
+
+extern const vp8_tree_index vp8_bmode_tree[];
+
+extern const vp8_tree_index vp8_ymode_tree[];
+extern const vp8_tree_index vp8_kf_ymode_tree[];
+extern const vp8_tree_index vp8_uv_mode_tree[];
+
+extern const vp8_tree_index vp8_mbsplit_tree[];
+extern const vp8_tree_index vp8_mv_ref_tree[];
+extern const vp8_tree_index vp8_sub_mv_ref_tree[];
+
+extern const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES];
+extern const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES];
+extern const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES];
+extern const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS];
+
+/* Inter mode values do not start at zero */
+
+extern const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS];
+extern const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS];
+
+extern const vp8_tree_index vp8_small_mvtree[];
+
+extern const struct vp8_token_struct vp8_small_mvencodings[8];
+
+/* Key frame default mode probs */
+extern const vp8_prob vp8_kf_bmode_prob[VP8_BINTRAMODES][VP8_BINTRAMODES]
+[VP8_BINTRAMODES-1];
+extern const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1];
+extern const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1];
+
+void vp8_init_mbmode_probs(VP8_COMMON *x);
+void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
+void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
+
+#endif
diff --git a/libvpx/vp8/common/entropymv.c b/libvpx/vp8/common/entropymv.c
new file mode 100644
index 0000000..e5df1f0
--- /dev/null
+++ b/libvpx/vp8/common/entropymv.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropymv.h"
+
+const MV_CONTEXT vp8_mv_update_probs[2] =
+{
+ {{
+ 237,
+ 246,
+ 253, 253, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 250, 250, 252, 254, 254
+ }},
+ {{
+ 231,
+ 243,
+ 245, 253, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 251, 251, 254, 254, 254
+ }}
+};
+const MV_CONTEXT vp8_default_mv_context[2] =
+{
+ {{
+ /* row */
+ 162, /* is short */
+ 128, /* sign */
+ 225, 146, 172, 147, 214, 39, 156, /* short tree */
+ 128, 129, 132, 75, 145, 178, 206, 239, 254, 254 /* long bits */
+ }},
+
+
+
+ {{
+ /* same for column */
+ 164, /* is short */
+ 128,
+ 204, 170, 119, 235, 140, 230, 228,
+ 128, 130, 130, 74, 148, 180, 203, 236, 254, 254 /* long bits */
+
+ }}
+};
diff --git a/libvpx/vp8/common/entropymv.h b/libvpx/vp8/common/entropymv.h
new file mode 100644
index 0000000..2db1e38
--- /dev/null
+++ b/libvpx/vp8/common/entropymv.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENTROPYMV_H
+#define __INC_ENTROPYMV_H
+
+#include "treecoder.h"
+
+enum
+{
+ mv_max = 1023, /* max absolute value of a MV component */
+ MVvals = (2 * mv_max) + 1, /* # possible values "" */
+ mvfp_max = 255, /* max absolute value of a full pixel MV component */
+ MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
+
+ mvlong_width = 10, /* large MVs have 10-bit magnitudes */
+ mvnum_short = 8, /* magnitudes 0 through 7 */
+
+ /* probability offsets for coding each MV component */
+
+ mvpis_short = 0, /* short (<= 7) vs long (>= 8) */
+ MVPsign, /* sign for non-zero */
+ MVPshort, /* 8 short values = 7-position tree */
+
+ MVPbits = MVPshort + mvnum_short - 1, /* mvlong_width long value bits */
+ MVPcount = MVPbits + mvlong_width /* (with independent probabilities) */
+};
+
+typedef struct mv_context
+{
+ vp8_prob prob[MVPcount]; /* often come in row, col pairs */
+} MV_CONTEXT;
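+
+/* Illustrative sketch (not part of the original source): decoding one MV
+ component with these probability offsets. vp8_treed_read() and
+ vp8_small_mvtree are declared elsewhere in the codec, and
+ read_long_magnitude() is a hypothetical helper that reads the
+ mvlong_width magnitude bits with p[MVPbits..]:
+
+ x = vp8_read(r, p[mvpis_short])
+ ? read_long_magnitude(r, p + MVPbits)
+ : vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+ if (x && vp8_read(r, p[MVPsign]))
+ x = -x;
+*/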
+
+extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
+
+#endif
diff --git a/libvpx/vp8/common/extend.c b/libvpx/vp8/common/extend.c
new file mode 100644
index 0000000..c9bdd21
--- /dev/null
+++ b/libvpx/vp8/common/extend.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "extend.h"
+#include "vpx_mem/vpx_mem.h"
+
+
+static void copy_and_extend_plane
+(
+ unsigned char *s, /* source */
+ int sp, /* source pitch */
+ unsigned char *d, /* destination */
+ int dp, /* destination pitch */
+ int h, /* height */
+ int w, /* width */
+ int et, /* extend top border */
+ int el, /* extend left border */
+ int eb, /* extend bottom border */
+ int er /* extend right border */
+)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+ int linesize;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = s;
+ src_ptr2 = s + w - 1;
+ dest_ptr1 = d - el;
+ dest_ptr2 = d + w;
+
+ for (i = 0; i < h; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], el);
+ vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
+ vpx_memset(dest_ptr2, src_ptr2[0], er);
+ src_ptr1 += sp;
+ src_ptr2 += sp;
+ dest_ptr1 += dp;
+ dest_ptr2 += dp;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = d - el;
+ src_ptr2 = d + dp * (h - 1) - el;
+ dest_ptr1 = d + dp * (-et) - el;
+ dest_ptr2 = d + dp * (h) - el;
+ linesize = el + er + w;
+
+ for (i = 0; i < et; i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+ dest_ptr1 += dp;
+ }
+
+ for (i = 0; i < eb; i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+ dest_ptr2 += dp;
+ }
+}
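+
+/* Illustration (not part of the original source): with w = h = 2 and all
+ four borders equal to 1, the 2x2 plane {a b; c d} is extended to the
+ 4x4 plane {a a b b; a a b b; c c d d; c c d d}: edge pixels are
+ replicated sideways first, then whole extended rows are replicated
+ up and down. */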
+
+
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst)
+{
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride,
+ dst->y_buffer, dst->y_stride,
+ src->y_height, src->y_width,
+ et, el, eb, er);
+
+ et = dst->border >> 1;
+ el = dst->border >> 1;
+ eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
+ er = (dst->border >> 1) + dst->uv_width - src->uv_width;
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride,
+ src->uv_height, src->uv_width,
+ et, el, eb, er);
+}
+
+
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw)
+{
+ int et = dst->border;
+ int el = dst->border;
+ int eb = dst->border + dst->y_height - src->y_height;
+ int er = dst->border + dst->y_width - src->y_width;
+ int src_y_offset = srcy * src->y_stride + srcx;
+ int dst_y_offset = srcy * dst->y_stride + srcx;
+ int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+
+ /* If a side does not touch the source boundary, don't extend it. */
+ if (srcy)
+ et = 0;
+ if (srcx)
+ el = 0;
+ if (srcy + srch != src->y_height)
+ eb = 0;
+ if (srcx + srcw != src->y_width)
+ er = 0;
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset,
+ src->y_stride,
+ dst->y_buffer + dst_y_offset,
+ dst->y_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ et = (et + 1) >> 1;
+ el = (el + 1) >> 1;
+ eb = (eb + 1) >> 1;
+ er = (er + 1) >> 1;
+ srch = (srch + 1) >> 1;
+ srcw = (srcw + 1) >> 1;
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->u_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset,
+ src->uv_stride,
+ dst->v_buffer + dst_uv_offset,
+ dst->uv_stride,
+ srch, srcw,
+ et, el, eb, er);
+}
+
+
+/* Note: the extension is only for the last row, for intra prediction purposes. */
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
+ unsigned char *YPtr,
+ unsigned char *UPtr,
+ unsigned char *VPtr)
+{
+ int i;
+
+ YPtr += ybf->y_stride * 14;
+ UPtr += ybf->uv_stride * 6;
+ VPtr += ybf->uv_stride * 6;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+
+ YPtr += ybf->y_stride;
+ UPtr += ybf->uv_stride;
+ VPtr += ybf->uv_stride;
+
+ for (i = 0; i < 4; i++)
+ {
+ YPtr[i] = YPtr[-1];
+ UPtr[i] = UPtr[-1];
+ VPtr[i] = VPtr[-1];
+ }
+}
diff --git a/libvpx/vp8/common/extend.h b/libvpx/vp8/common/extend.h
new file mode 100644
index 0000000..74a0b17
--- /dev/null
+++ b/libvpx/vp8/common/extend.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_EXTEND_H
+#define __INC_EXTEND_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
+void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int srcy, int srcx,
+ int srch, int srcw);
+
+#endif
diff --git a/libvpx/vp8/common/filter.c b/libvpx/vp8/common/filter.c
new file mode 100644
index 0000000..1901ea3
--- /dev/null
+++ b/libvpx/vp8/common/filter.c
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{
+ { 128, 0 },
+ { 112, 16 },
+ { 96, 32 },
+ { 80, 48 },
+ { 64, 64 },
+ { 48, 80 },
+ { 32, 96 },
+ { 16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+
+ { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+ { 0, -6, 123, 12, -1, 0 },
+ { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
+ { 0, -9, 93, 50, -6, 0 },
+ { 3, -16, 77, 77, -16, 3 }, /* New 1/2 pel 6 tap filter */
+ { 0, -6, 50, 93, -9, 0 },
+ { 1, -8, 36, 108, -11, 2 }, /* New 1/4 pel 6 tap filter */
+ { 0, -1, 12, 123, -6, 0 },
+};
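+
+/* Illustrative note (not part of the original source): xoffset and yoffset
+ are sub-pixel positions in 1/8-pel units and index a filter row directly;
+ every row sums to VP8_FILTER_WEIGHT (128), so the filters preserve DC.
+ E.g. a half-pel horizontal, integer vertical prediction would use
+
+ HFilter = vp8_sub_pel_filters[4];
+ VFilter = vp8_sub_pel_filters[0];
+*/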
+
+static void filter_block2d_first_pass
+(
+ unsigned char *src_ptr,
+ int *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = Temp;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void filter_block2d_second_pass
+(
+ int *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+ ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+ ((int)src_ptr[0] * vp8_filter[2]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[3]) +
+ ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
+ ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+
+ /* Normalize back to 0-255 */
+ Temp = Temp >> VP8_FILTER_SHIFT;
+
+ if (Temp < 0)
+ Temp = 0;
+ else if (Temp > 255)
+ Temp = 255;
+
+ output_ptr[j] = (unsigned char)Temp;
+ src_ptr++;
+ }
+
+ /* Start next row */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_pitch;
+ }
+}
+
+
+static void filter_block2d
+(
+ unsigned char *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ int output_pitch,
+ const short *HFilter,
+ const short *VFilter
+)
+{
+ int FData[9*4]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[13*16]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+ int FData[21*24]; /* Temp data buffer used in filtering */
+
+
+ HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+ VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+ /* then filter vertically... */
+ filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_stride : Stride of source block.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the horizontal direction to produce the filtered output
+ * block. Used to implement first-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+ unsigned char *src_ptr,
+ unsigned short *dst_ptr,
+ unsigned int src_stride,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < height; i++)
+ {
+ for (j = 0; j < width; j++)
+ {
+ /* Apply bilinear filter */
+ dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[1] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride - width;
+ dst_ptr += width;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil_second_pass
+ *
+ * INPUTS : UINT16 *src_ptr : Pointer to source block.
+ * UINT32 dst_pitch : Destination block pitch.
+ * UINT32 height : Block height.
+ * UINT32 width : Block width.
+ * INT16 *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
+ * in the vertical direction to produce the filtered output
+ * block. Used to implement second-pass of 2-D separable filter.
+ *
+ * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+ unsigned short *src_ptr,
+ unsigned char *dst_ptr,
+ int dst_pitch,
+ unsigned int height,
+ unsigned int width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < height; i++)
+ {
+ for (j = 0; j < width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[width] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+ dst_ptr[j] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ dst_ptr += dst_pitch;
+ }
+}
+
+
+/****************************************************************************
+ *
+ * ROUTINE : filter_block2d_bil
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pitch : Stride of source block.
+ * UINT32 dst_pitch : Stride of destination block.
+ * INT16 *HFilter : Array of 2 horizontal filter taps.
+ * INT16 *VFilter : Array of 2 vertical filter taps.
+ * INT32 Width : Block width
+ * INT32 Height : Block height
+ *
+ * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : 2-D filters an input block by applying a 2-tap
+ * bi-linear filter horizontally followed by a 2-tap
+ * bi-linear filter vertically on the result.
+ *
+ * SPECIAL NOTES : The largest block size that can be handled here is 16x16.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ unsigned int src_pitch,
+ unsigned int dst_pitch,
+ const short *HFilter,
+ const short *VFilter,
+ int Width,
+ int Height
+)
+{
+
+ unsigned short FData[17*16]; /* Temp data buffer used in filtering */
+
+ /* First filter 1-D horizontally... */
+ filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+ /* then 1-D vertically... */
+ filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+#if 0
+ {
+ int i;
+ unsigned char temp1[16];
+ unsigned char temp2[16];
+
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+ for (i = 0; i < 16; i++)
+ {
+ if (temp1[i] != temp2[i])
+ {
+ bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+ filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+ }
+ }
+ }
+#endif
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+void vp8_bilinear_predict8x8_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+void vp8_bilinear_predict8x4_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+void vp8_bilinear_predict16x16_c
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ const short *HFilter;
+ const short *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
diff --git a/libvpx/vp8/common/filter.h b/libvpx/vp8/common/filter.h
new file mode 100644
index 0000000..b7591f2
--- /dev/null
+++ b/libvpx/vp8/common/filter.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif
diff --git a/libvpx/vp8/common/findnearmv.c b/libvpx/vp8/common/findnearmv.c
new file mode 100644
index 0000000..e8ee40f
--- /dev/null
+++ b/libvpx/vp8/common/findnearmv.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "findnearmv.h"
+
+const unsigned char vp8_mbsplit_offset[4][16] = {
+ { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+/* Predict motion vectors using those from already-decoded nearby blocks.
+ Note that we only consider one 4x4 subblock from each candidate 16x16
+ macroblock. */
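+/* Weighting: each inter-coded above or left neighbour contributes 2 and the
+   above-left neighbour contributes 1, credited either to the matching MV
+   candidate or, for zero MVs, to cnt[CNT_INTRA]; these counts rank the
+   nearest/near candidates below. */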
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *nearest,
+ int_mv *nearby,
+ int_mv *best_mv,
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[4];
+ int_mv *mv = near_mvs;
+ int *cntx = cnt;
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+
+ /* Zero accumulators */
+ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++mv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, mv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != mv->as_int)
+ {
+ (++mv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+    /* If we have three distinct MVs ... */
+ if (cnt[CNT_SPLITMV])
+ {
+ /* See if above-left MV can be merged with NEAREST */
+ if (mv->as_int == near_mvs[CNT_NEAREST].as_int)
+ cnt[CNT_NEAREST] += 1;
+ }
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ /* Use near_mvs[0] to store the "best" MV */
+ if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA])
+ near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
+
+ /* Set up return values */
+ best_mv->as_int = near_mvs[0].as_int;
+ nearest->as_int = near_mvs[CNT_NEAREST].as_int;
+ nearby->as_int = near_mvs[CNT_NEAR].as_int;
+}
+
+
+static void invert_and_clamp_mvs(int_mv *inv, int_mv *src, MACROBLOCKD *xd)
+{
+ inv->as_mv.row = src->as_mv.row * -1;
+ inv->as_mv.col = src->as_mv.col * -1;
+ vp8_clamp_mv2(inv, xd);
+ vp8_clamp_mv2(src, xd);
+}
+
+
+int vp8_find_near_mvs_bias
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2],
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+)
+{
+ int sign_bias = ref_frame_sign_bias[refframe];
+
+ vp8_find_near_mvs(xd,
+ here,
+ &mode_mv_sb[sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARMV],
+ &best_mv_sb[sign_bias],
+ cnt,
+ refframe,
+ ref_frame_sign_bias);
+
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARESTMV],
+ &mode_mv_sb[sign_bias][NEARESTMV], xd);
+ invert_and_clamp_mvs(&mode_mv_sb[!sign_bias][NEARMV],
+ &mode_mv_sb[sign_bias][NEARMV], xd);
+ invert_and_clamp_mvs(&best_mv_sb[!sign_bias],
+ &best_mv_sb[sign_bias], xd);
+
+ return sign_bias;
+}
+
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+)
+{
+ p[0] = vp8_mode_contexts [near_mv_ref_ct[0]] [0];
+ p[1] = vp8_mode_contexts [near_mv_ref_ct[1]] [1];
+ p[2] = vp8_mode_contexts [near_mv_ref_ct[2]] [2];
+ p[3] = vp8_mode_contexts [near_mv_ref_ct[3]] [3];
+ /*p[3] = vp8_mode_contexts [near_mv_ref_ct[1] + near_mv_ref_ct[2] + near_mv_ref_ct[3]] [3];*/
+ return p;
+}
+
diff --git a/libvpx/vp8/common/findnearmv.h b/libvpx/vp8/common/findnearmv.h
new file mode 100644
index 0000000..06ef060
--- /dev/null
+++ b/libvpx/vp8/common/findnearmv.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_FINDNEARMV_H
+#define __INC_FINDNEARMV_H
+
+#include "mv.h"
+#include "blockd.h"
+#include "modecont.h"
+#include "treecoder.h"
+
+
+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
+ const int *ref_frame_sign_bias)
+{
+ if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe])
+ {
+ mvp->as_mv.row *= -1;
+ mvp->as_mv.col *= -1;
+ }
+}
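+/* For example, a candidate MV taken from a macroblock whose reference frame
+ * has the opposite sign bias to refframe is negated here, so all candidates
+ * share one sign convention before they are compared and counted.
+ */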
+
+#define LEFT_TOP_MARGIN (16 << 3)
+#define RIGHT_BOTTOM_MARGIN (16 << 3)
+static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
+{
+ if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+ mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+ else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+ mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+ if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+ mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+ else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+ mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+}
+
+static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
+ int mb_to_top_edge, int mb_to_bottom_edge)
+{
+ mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
+ mb_to_left_edge : mv->as_mv.col;
+ mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
+ mb_to_right_edge : mv->as_mv.col;
+ mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
+ mb_to_top_edge : mv->as_mv.row;
+ mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+ mb_to_bottom_edge : mv->as_mv.row;
+}
+static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ unsigned int need_to_clamp;
+ need_to_clamp = (mv->as_mv.col < mb_to_left_edge);
+ need_to_clamp |= (mv->as_mv.col > mb_to_right_edge);
+ need_to_clamp |= (mv->as_mv.row < mb_to_top_edge);
+ need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge);
+ return need_to_clamp;
+}
+
+void vp8_find_near_mvs
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *nearest, int_mv *nearby, int_mv *best,
+ int near_mv_ref_cts[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+
+int vp8_find_near_mvs_bias
+(
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv mode_mv_sb[2][MB_MODE_COUNT],
+ int_mv best_mv_sb[2],
+ int cnt[4],
+ int refframe,
+ int *ref_frame_sign_bias
+);
+
+
+vp8_prob *vp8_mv_ref_probs(
+ vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
+);
+
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
+
+static int left_block_mv(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+
+ if(cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv.as_int;
+ b += 4;
+ }
+
+ return (cur_mb->bmi + b - 1)->mv.as_int;
+}
+
+static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ if(cur_mb->mbmi.mode != SPLITMV)
+ return cur_mb->mbmi.mv.as_int;
+ b += 16;
+ }
+
+ return (cur_mb->bmi + b - 4)->mv.as_int;
+}
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
+{
+ if (!(b & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ --cur_mb;
+ switch (cur_mb->mbmi.mode)
+ {
+ case B_PRED:
+ return (cur_mb->bmi + b + 3)->as_mode;
+ case DC_PRED:
+ return B_DC_PRED;
+ case V_PRED:
+ return B_VE_PRED;
+ case H_PRED:
+ return B_HE_PRED;
+ case TM_PRED:
+ return B_TM_PRED;
+ default:
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 1)->as_mode;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
+{
+ if (!(b >> 2))
+ {
+ /* On top edge, get from MB above us */
+ cur_mb -= mi_stride;
+
+ switch (cur_mb->mbmi.mode)
+ {
+ case B_PRED:
+ return (cur_mb->bmi + b + 12)->as_mode;
+ case DC_PRED:
+ return B_DC_PRED;
+ case V_PRED:
+ return B_VE_PRED;
+ case H_PRED:
+ return B_HE_PRED;
+ case TM_PRED:
+ return B_TM_PRED;
+ default:
+ return B_DC_PRED;
+ }
+ }
+
+ return (cur_mb->bmi + b - 4)->as_mode;
+}
+
+#endif
diff --git a/libvpx/vp8/common/generic/systemdependent.c b/libvpx/vp8/common/generic/systemdependent.c
new file mode 100644
index 0000000..5a6ac7b
--- /dev/null
+++ b/libvpx/vp8/common/generic/systemdependent.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#elif ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#endif
+#include "vp8/common/onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
+#elif defined(__OS2__)
+#define INCL_DOS
+#define INCL_DOSSPINLOCK
+#include <os2.h>
+#endif
+#endif
+
+#if CONFIG_MULTITHREAD
+static int get_cpu_count()
+{
+ int core_count = 16;
+
+#if HAVE_UNISTD_H && !defined(__OS2__)
+#if defined(_SC_NPROCESSORS_ONLN)
+ core_count = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(_SC_NPROC_ONLN)
+ core_count = sysconf(_SC_NPROC_ONLN);
+#endif
+#elif defined(_WIN32)
+ {
+ PGNSI pGNSI;
+ SYSTEM_INFO sysinfo;
+
+ /* Call GetNativeSystemInfo if supported or
+ * GetSystemInfo otherwise. */
+
+ pGNSI = (PGNSI) GetProcAddress(
+ GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
+ if (pGNSI != NULL)
+ pGNSI(&sysinfo);
+ else
+ GetSystemInfo(&sysinfo);
+
+ core_count = sysinfo.dwNumberOfProcessors;
+ }
+#elif defined(__OS2__)
+ {
+ ULONG proc_id;
+ ULONG status;
+
+ core_count = 0;
+ for (proc_id = 1; ; proc_id++)
+ {
+ if (DosGetProcessorStatus(proc_id, &status))
+ break;
+
+ if (status == PROC_ONLINE)
+ core_count++;
+ }
+ }
+#else
+ /* other platforms */
+#endif
+
+ return core_count > 0 ? core_count : 1;
+}
+#endif
+
+
+void vp8_machine_specific_config(VP8_COMMON *ctx)
+{
+#if CONFIG_MULTITHREAD
+ ctx->processor_core_count = get_cpu_count();
+#endif /* CONFIG_MULTITHREAD */
+
+#if ARCH_ARM
+ ctx->cpu_caps = arm_cpu_caps();
+#elif ARCH_X86 || ARCH_X86_64
+ ctx->cpu_caps = x86_simd_caps();
+#endif
+}
diff --git a/libvpx/vp8/common/header.h b/libvpx/vp8/common/header.h
new file mode 100644
index 0000000..3e98eeb
--- /dev/null
+++ b/libvpx/vp8/common/header.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct
+{
+ unsigned int type: 1;
+ unsigned int version: 3;
+ unsigned int show_frame: 1;
+
+ /* Allow 2^20 bytes = 8 megabits for first partition */
+
+ unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+ unsigned int frame_number;
+ unsigned int update_gold: 1;
+ unsigned int uses_gold: 1;
+ unsigned int update_last: 1;
+ unsigned int uses_last: 1;
+#endif
+
+} VP8_HEADER;
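+/* Illustrative sketch: assuming the tag is packed little-endian into the
+ * first 3 bytes of the frame, the fields above can be unpacked as follows
+ * (a hypothetical helper mirroring the bitfield order and widths).
+ */
+#if 0
+static void example_unpack_frame_tag(const unsigned char *data)
+{
+    unsigned int tag = data[0] | (data[1] << 8) |
+                       ((unsigned int)data[2] << 16);
+    unsigned int type = tag & 1;                       /* 0 = key frame */
+    unsigned int version = (tag >> 1) & 7;
+    unsigned int show_frame = (tag >> 4) & 1;
+    unsigned int first_partition_length_in_bytes = (tag >> 5) & 0x7FFFF;
+}
+#endif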
+
+#ifdef PACKET_TESTING
+#define VP8_HEADER_SIZE 8
+#else
+#define VP8_HEADER_SIZE 3
+#endif
+
+
+#endif
diff --git a/libvpx/vp8/common/idct_blk.c b/libvpx/vp8/common/idct_blk.c
new file mode 100644
index 0000000..0b058c7
--- /dev/null
+++ b/libvpx/vp8/common/idct_blk.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
+
+void vp8_dequant_idct_add_y_block_c
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dst, stride);
+ else
+ {
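+                /* DC-only block: q[0]*dq[0] is the dequantized DC
+                 * coefficient; the ((int *)q)[0] = 0 store clears the
+                 * first two 16-bit coefficients in one write (assumes a
+                 * 32-bit int).
+                 */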
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4*stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_c
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dstu, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4*stride - 8;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_c (q, dq, dstv, stride);
+ else
+ {
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4*stride - 8;
+ }
+}
diff --git a/libvpx/vp8/common/idctllm.c b/libvpx/vp8/common/idctllm.c
new file mode 100644
index 0000000..47af52f
--- /dev/null
+++ b/libvpx/vp8/common/idctllm.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16-bit fixed-point versions of two
+ * multiply constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16-bit
+ * fixed-point precision as the second one, we use the identity
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+ **************************************************************************/
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
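+/* Illustrative check: both values are the Q16 encodings described above,
+ * i.e. (sqrt(2)*cos(pi/8) - 1) * 65536 ~= 20091 and
+ * sqrt(2)*sin(pi/8) * 65536 ~= 35468. A hypothetical self-test, assuming
+ * M_PI is available from <math.h>:
+ */
+#if 0
+#include <assert.h>
+#include <math.h>
+static void example_check_constants(void)
+{
+    assert((int)((sqrt(2.0) * cos(M_PI / 8.0) - 1.0) * 65536.0 + 0.5) == 20091);
+    assert((int)(sqrt(2.0) * sin(M_PI / 8.0) * 65536.0 + 0.5) == 35468);
+}
+#endif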
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int i;
+ int r, c;
+ int a1, b1, c1, d1;
+ short output[16];
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = 4;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+
+ ip = output;
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = ip[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int a1 = ((input_dc + 4) >> 3);
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = a1 + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+
+}
+
+void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff)
+{
+ short output[16];
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[3];
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = (a2 + 3) >> 3;
+ op[1] = (b2 + 3) >> 3;
+ op[2] = (c2 + 3) >> 3;
+ op[3] = (d2 + 3) >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+
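+    /* Scatter: output[i] becomes the DC (position 0) coefficient of the
+     * i-th 4x4 block; consecutive blocks' dqcoeff entries are 16 shorts
+     * apart, hence the stride of 16.
+     */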
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff)
+{
+ int i;
+ int a1;
+
+ a1 = ((input[0] + 3) >> 3);
+ for(i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = a1;
+ }
+}
diff --git a/libvpx/vp8/common/invtrans.h b/libvpx/vp8/common/invtrans.h
new file mode 100644
index 0000000..d048665
--- /dev/null
+++ b/libvpx/vp8/common/invtrans.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+#if CONFIG_MULTITHREAD
+#include "vpx_mem/vpx_mem.h"
+#endif
+
+static void eob_adjust(char *eobs, short *diff)
+{
+    /* eob adjust: the idct can only be skipped if both the dc and the eob are zero */
+ int js;
+ for(js = 0; js < 16; js++)
+ {
+ if((eobs[js] == 0) && (diff[0] != 0))
+ eobs[js]++;
+ diff+=16;
+ }
+}
+
+static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
+{
+ short *DQC = xd->dequant_y1;
+
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_short_inv_walsh4x4
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ else
+ {
+ vp8_short_inv_walsh4x4_1
+ (&xd->block[24].dqcoeff[0], xd->qcoeff);
+ }
+ eob_adjust(xd->eobs, xd->qcoeff);
+
+ DQC = xd->dequant_y1_dc;
+ }
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+}
+#endif
diff --git a/libvpx/vp8/common/loopfilter.c b/libvpx/vp8/common/loopfilter.c
new file mode 100644
index 0000000..41b4f12
--- /dev/null
+++ b/libvpx/vp8/common/loopfilter.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "loopfilter.h"
+#include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+typedef unsigned char uc;
+
+static void lf_init_lut(loop_filter_info_n *lfi)
+{
+ int filt_lvl;
+
+ for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
+ {
+ if (filt_lvl >= 40)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+ }
+ else if (filt_lvl >= 20)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+ }
+ else if (filt_lvl >= 15)
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+ }
+ else
+ {
+ lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+ lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+ }
+ }
+
+ lfi->mode_lf_lut[DC_PRED] = 1;
+ lfi->mode_lf_lut[V_PRED] = 1;
+ lfi->mode_lf_lut[H_PRED] = 1;
+ lfi->mode_lf_lut[TM_PRED] = 1;
+ lfi->mode_lf_lut[B_PRED] = 0;
+
+ lfi->mode_lf_lut[ZEROMV] = 1;
+ lfi->mode_lf_lut[NEARESTMV] = 2;
+ lfi->mode_lf_lut[NEARMV] = 2;
+ lfi->mode_lf_lut[NEWMV] = 2;
+ lfi->mode_lf_lut[SPLITMV] = 3;
+
+}
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl)
+{
+ int i;
+
+ /* For each possible value for the loop filter fill out limits */
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+ int block_inside_limit = 0;
+
+        /* Set loop filter parameters that control sharpness. */
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0)
+ {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1)
+ block_inside_limit = 1;
+
+ vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+ SIMD_WIDTH);
+ vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
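+/* Worked example: for filt_lvl = 32 and sharpness_lvl = 0 the limits above
+ * become lim = 32, blim = 2*32 + 32 = 96 and mblim = 2*(32 + 2) + 32 = 100;
+ * any sharpness_lvl > 0 only ever shrinks block_inside_limit, down to its
+ * floor of 1.
+ */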
+
+void vp8_loop_filter_init(VP8_COMMON *cm)
+{
+ loop_filter_info_n *lfi = &cm->lf_info;
+ int i;
+
+ /* init limits for given sharpness*/
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+
+ /* init LUT for lvl and hev thr picking */
+ lf_init_lut(lfi);
+
+ /* init hev threshold const vectors */
+ for(i = 0; i < 4 ; i++)
+ {
+ vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ }
+}
+
+void vp8_loop_filter_frame_init(VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl)
+{
+ int seg, /* segment number */
+ ref, /* index in ref_lf_deltas */
+ mode; /* index in mode_lf_deltas */
+
+ loop_filter_info_n *lfi = &cm->lf_info;
+
+ /* update limits if sharpness has changed */
+ if(cm->last_sharpness_level != cm->sharpness_level)
+ {
+ vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
+ {
+ int lvl_seg = default_filt_lvl;
+ int lvl_ref, lvl_mode;
+
+ /* Note the baseline filter values for each segment */
+ if (mbd->segmentation_enabled)
+ {
+ /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ {
+ lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ }
+ else /* Delta Value */
+ {
+ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
+ lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
+ }
+ }
+
+ if (!mbd->mode_ref_lf_delta_enabled)
+ {
+ /* we could get rid of this if we assume that deltas are set to
+             * zero when not in use; the encoder always uses deltas
+ */
+ vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+ continue;
+ }
+
+ lvl_ref = lvl_seg;
+
+ /* INTRA_FRAME */
+ ref = INTRA_FRAME;
+
+ /* Apply delta for reference frame */
+ lvl_ref += mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Intra modes */
+ mode = 0; /* B_PRED */
+ /* Only the split mode BPRED has a further special case */
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ mode = 1; /* all the rest of Intra modes */
+ lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+
+ /* LAST, GOLDEN, ALT */
+ for(ref = 1; ref < MAX_REF_FRAMES; ref++)
+ {
+ int lvl_ref = lvl_seg;
+
+ /* Apply delta for reference frame */
+ lvl_ref += mbd->ref_lf_deltas[ref];
+
+ /* Apply delta for Inter modes */
+ for (mode = 1; mode < 4; mode++)
+ {
+ lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
+ lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
+
+ lfi->lvl[seg][ref][mode] = lvl_mode;
+ }
+ }
+ }
+}
+
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr)
+{
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr)
+{
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+
+}
+void vp8_loop_filter_frame(VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int frame_type)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int mb_row;
+ int mb_col;
+ int mb_rows = cm->mb_rows;
+ int mb_cols = cm->mb_cols;
+
+ int filter_level;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int post_y_stride = post->y_stride;
+ int post_uv_stride = post->uv_stride;
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+ u_ptr = post->u_buffer;
+ v_ptr = post->v_buffer;
+
+ /* vp8_filter each macro block */
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+ else /* SIMPLE_LOOPFILTER */
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+ if (filter_level)
+ {
+ const unsigned char * mblim = lfi_n->mblim[filter_level];
+ const unsigned char * blim = lfi_n->blim[filter_level];
+
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post_y_stride, blim);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post_y_stride, mblim);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post_y_stride, blim);
+ }
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+}
+
+void vp8_loop_filter_frame_yonly
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ /* Point at base of Mb MODE_INFO list */
+ const MODE_INFO *mode_info_context = cm->mi;
+
+#if 0
+ if(default_filt_lvl == 0) /* no filter applied */
+ return;
+#endif
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
+
+ /* Set up the buffer pointers */
+ y_ptr = post->y_buffer;
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if (filter_level)
+ {
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context ++; /* step to next MB */
+
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context ++; /* Skip border mb */
+ }
+
+}
+
+void vp8_loop_filter_partial_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+
+ unsigned char *y_ptr;
+ int mb_row;
+ int mb_col;
+ int mb_cols = post->y_width >> 4;
+ int mb_rows = post->y_height >> 4;
+
+ int linestocopy, i;
+
+ loop_filter_info_n *lfi_n = &cm->lf_info;
+ loop_filter_info lfi;
+
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
+ FRAME_TYPE frame_type = cm->frame_type;
+
+ const MODE_INFO *mode_info_context;
+
+ int lvl_seg[MAX_MB_SEGMENTS];
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = mb_rows / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+ /* Note the baseline filter values for each segment */
+ /* See vp8_loop_filter_frame_init. Rather than call that for each change
+ * to default_filt_lvl, copy the relevant calculation here.
+ */
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ { /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ {
+ lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ }
+ /* Delta Value */
+ else
+ {
+ lvl_seg[i] = default_filt_lvl
+ + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ lvl_seg[i] = (lvl_seg[i] > 0) ?
+ ((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
+ }
+ }
+ }
+
+ /* Set up the buffer pointers; partial image starts at ~middle of frame */
+ y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride;
+ mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ if (alt_flt_enabled)
+ filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
+ else
+ filter_level = default_filt_lvl;
+
+ if (filter_level)
+ {
+ if (cm->filter_type == NORMAL_LOOPFILTER)
+ {
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ vp8_loop_filter_mbh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (y_ptr, 0, 0, post->y_stride, 0, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+
+ vp8_loop_filter_simple_mbh
+ (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ y_ptr += 16;
+ mode_info_context += 1; /* step to next MB */
+ }
+
+ y_ptr += post->y_stride * 16 - post->y_width;
+ mode_info_context += 1; /* Skip border mb */
+ }
+}
diff --git a/libvpx/vp8/common/loopfilter.h b/libvpx/vp8/common/loopfilter.h
new file mode 100644
index 0000000..b3af2d6
--- /dev/null
+++ b/libvpx/vp8/common/loopfilter.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#define MAX_LOOP_FILTER 63
+/* fraction of total macroblock rows to be used in fast filter level picking */
+/* has to be > 2 */
+#define PARTIAL_FRAME_FRACTION 8
+
+typedef enum
+{
+ NORMAL_LOOPFILTER = 0,
+ SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* This structure needs to be aligned so that, when it is declared and
+ * passed, it can be loaded into vector registers.
+ */
+typedef struct
+{
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
+ unsigned char lvl[4][4][4];
+ unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+ unsigned char mode_lf_lut[10];
+} loop_filter_info_n;
+
+typedef struct loop_filter_info
+{
+ const unsigned char * mblim;
+ const unsigned char * blim;
+ const unsigned char * lim;
+ const unsigned char * hev_thr;
+} loop_filter_info;
+
+
+typedef void loop_filter_uvfunction
+(
+ unsigned char *u, /* source pointer */
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ unsigned char *v
+);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP8Common;
+struct macroblockd;
+struct modeinfo;
+
+void vp8_loop_filter_init(struct VP8Common *cm);
+
+void vp8_loop_filter_frame_init(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame(struct VP8Common *cm, struct macroblockd *mbd,
+ int frame_type);
+
+void vp8_loop_filter_partial_frame(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
+ struct macroblockd *mbd,
+ int default_filt_lvl);
+
+void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+ int sharpness_lvl);
+
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+ struct modeinfo *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+ struct modeinfo *mode_info_context,
+ int mb_row, int post_ystride, int post_uvstride,
+ unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr);
+#endif
diff --git a/libvpx/vp8/common/loopfilter_filters.c b/libvpx/vp8/common/loopfilter_filters.c
new file mode 100644
index 0000000..8235f6e
--- /dev/null
+++ b/libvpx/vp8/common/loopfilter_filters.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef unsigned char uc;
+
+static signed char vp8_signed_char_clamp(int t)
+{
+ t = (t < -128 ? -128 : t);
+ t = (t > 127 ? 127 : t);
+ return (signed char) t;
+}
+
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_filter_mask(uc limit, uc blimit,
+ uc p3, uc p2, uc p1, uc p0,
+ uc q0, uc q1, uc q2, uc q3)
+{
+ signed char mask = 0;
+ mask |= (abs(p3 - p2) > limit);
+ mask |= (abs(p2 - p1) > limit);
+ mask |= (abs(p1 - p0) > limit);
+ mask |= (abs(q1 - q0) > limit);
+ mask |= (abs(q2 - q1) > limit);
+ mask |= (abs(q3 - q2) > limit);
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
+ return mask - 1;
+}
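+/* Note: mask becomes 1 as soon as any delta exceeds its limit, so the
+ * function returns 0 (all zeros, filter suppressed); if every delta is in
+ * range it returns -1 (11111111) and the filter result passes the AND.
+ */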
+
+/* is there a high-variance internal edge? ( 11111111 yes, 00000000 no) */
+static signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
+{
+ signed char hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static void vp8_filter(signed char mask, uc hev, uc *op1,
+ uc *op0, uc *oq0, uc *oq1)
+
+{
+ signed char ps0, qs0;
+ signed char ps1, qs1;
+ signed char vp8_filter, Filter1, Filter2;
+ signed char u;
+
+ ps1 = (signed char) * op1 ^ 0x80;
+ ps0 = (signed char) * op0 ^ 0x80;
+ qs0 = (signed char) * oq0 ^ 0x80;
+ qs1 = (signed char) * oq1 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+ vp8_filter &= hev;
+
+ /* inner taps */
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ vp8_filter &= mask;
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3;
+     * if it equals 4 we'll set it to adjust by -1 to account for the fact
+     * we'd round 3 the other way
+ */
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(qs0 - Filter1);
+ *oq0 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps0 + Filter2);
+ *op0 = u ^ 0x80;
+ vp8_filter = Filter1;
+
+ /* outer tap adjustments */
+ vp8_filter += 1;
+ vp8_filter >>= 1;
+ vp8_filter &= ~hev;
+
+ u = vp8_signed_char_clamp(qs1 - vp8_filter);
+ *oq1 = u ^ 0x80;
+ u = vp8_signed_char_clamp(ps1 + vp8_filter);
+ *op1 = u ^ 0x80;
+
+}
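+/* Note: the Filter1/Filter2 pair is a rounded divide by 8 split across the
+ * edge: e.g. for vp8_filter = 4, Filter1 = (4 + 4) >> 3 = 1 is subtracted
+ * from q0 while Filter2 = (4 + 3) >> 3 = 0 is added to p0, so the two sides
+ * never both round up.
+ */
+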
+void vp8_loop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+}
+
+void vp8_loop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ int hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
+
+ s += p;
+ }
+ while (++i < count * 8);
+}
+
+static void vp8_mbfilter(signed char mask, uc hev,
+ uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
+{
+ signed char s, u;
+ signed char vp8_filter, Filter1, Filter2;
+ signed char ps2 = (signed char) * op2 ^ 0x80;
+ signed char ps1 = (signed char) * op1 ^ 0x80;
+ signed char ps0 = (signed char) * op0 ^ 0x80;
+ signed char qs0 = (signed char) * oq0 ^ 0x80;
+ signed char qs1 = (signed char) * oq1 ^ 0x80;
+ signed char qs2 = (signed char) * oq2 ^ 0x80;
+
+ /* add outer taps if we have high edge variance */
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+ vp8_filter &= mask;
+
+ Filter2 = vp8_filter;
+ Filter2 &= hev;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(Filter2 + 4);
+ Filter2 = vp8_signed_char_clamp(Filter2 + 3);
+ Filter1 >>= 3;
+ Filter2 >>= 3;
+ qs0 = vp8_signed_char_clamp(qs0 - Filter1);
+ ps0 = vp8_signed_char_clamp(ps0 + Filter2);
+
+
+ /* only apply wider filter if not high edge variance */
+ vp8_filter &= ~hev;
+ Filter2 = vp8_filter;
+
+ /* roughly 3/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
+ s = vp8_signed_char_clamp(qs0 - u);
+ *oq0 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps0 + u);
+ *op0 = s ^ 0x80;
+
+ /* roughly 2/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
+ s = vp8_signed_char_clamp(qs1 - u);
+ *oq1 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps1 + u);
+ *op1 = s ^ 0x80;
+
+ /* roughly 1/7th difference across boundary */
+ u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
+ s = vp8_signed_char_clamp(qs2 - u);
+ *oq2 = s ^ 0x80;
+ s = vp8_signed_char_clamp(ps2 + u);
+ *op2 = s ^ 0x80;
+}
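+/* Note: the 27/18/9 multipliers match the fractions quoted above: u is
+ * applied to both sides of the edge, and 2*27/128 = 0.42, 2*18/128 = 0.28,
+ * 2*9/128 = 0.14 are roughly 3/7, 2/7 and 1/7 respectively.
+ */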
+
+void vp8_mbloop_filter_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ * of 8 bit simd instructions.
+ */
+ do
+ {
+
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],
+ s[0*p], s[1*p], s[2*p], s[3*p]);
+
+ hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+
+ vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
+
+ ++s;
+ }
+ while (++i < count * 8);
+
+}
+
+
+void vp8_mbloop_filter_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count
+)
+{
+ signed char hev = 0; /* high edge variance */
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+
+ mask = vp8_filter_mask(limit[0], blimit[0],
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
+
+ hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+
+ vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+
+ s += p;
+ }
+ while (++i < count * 8);
+
+}
+
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
+static signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
+{
+/* Why does this cause problems for win32?
+ * error C2143: syntax error : missing ';' before 'type'
+ * (void) limit;
+ */
+ signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
+ return mask;
+}
+
+static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, uc *oq1)
+{
+ signed char vp8_filter, Filter1, Filter2;
+ signed char p1 = (signed char) * op1 ^ 0x80;
+ signed char p0 = (signed char) * op0 ^ 0x80;
+ signed char q0 = (signed char) * oq0 ^ 0x80;
+ signed char q1 = (signed char) * oq1 ^ 0x80;
+ signed char u;
+
+ vp8_filter = vp8_signed_char_clamp(p1 - q1);
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
+ vp8_filter &= mask;
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
+ Filter1 >>= 3;
+ u = vp8_signed_char_clamp(q0 - Filter1);
+ *oq0 = u ^ 0x80;
+
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
+ Filter2 >>= 3;
+ u = vp8_signed_char_clamp(p0 + Filter2);
+ *op0 = u ^ 0x80;
+}
+
+void vp8_loop_filter_simple_horizontal_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+)
+{
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+ mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
+ vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+ while (++i < 16);
+}
+
+void vp8_loop_filter_simple_vertical_edge_c
+(
+ unsigned char *s,
+ int p,
+ const unsigned char *blimit
+)
+{
+ signed char mask = 0;
+ int i = 0;
+
+ do
+ {
+ mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
+ vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
+ s += p;
+ }
+ while (++i < 16);
+
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
+ unsigned char *v_ptr, int y_stride, int uv_stride,
+ loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+ const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
+}
diff --git a/libvpx/vp8/common/mbpitch.c b/libvpx/vp8/common/mbpitch.c
new file mode 100644
index 0000000..32e1b66
--- /dev/null
+++ b/libvpx/vp8/common/mbpitch.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+
+void vp8_setup_block_dptrs(MACROBLOCKD *x)
+{
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
+
+ }
+ }
+
+ for (r = 0; r < 25; r++)
+ {
+ x->block[r].qcoeff = x->qcoeff + r * 16;
+ x->block[r].dqcoeff = x->dqcoeff + r * 16;
+ x->block[r].eob = x->eobs + r;
+ }
+}
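+
+/* Editorial note: the predictor buffer is laid out as a 16x16 Y plane
+ * (bytes 0-255) followed by 8x8 U and V planes (bytes 256-319 and
+ * 320-383), so blocks 0-15 are Y, 16-19 U and 20-23 V.  For example,
+ * Y block 6 (r = 1, c = 2) maps to predictor + 1*4*16 + 2*4 ==
+ * predictor + 72.
+ */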
+
+void vp8_build_block_doffsets(MACROBLOCKD *x)
+{
+ int block;
+
+ for (block = 0; block < 16; block++) /* y blocks */
+ {
+ x->block[block].offset =
+ (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4;
+ }
+
+ for (block = 16; block < 20; block++) /* U and V blocks */
+ {
+ x->block[block+4].offset =
+ x->block[block].offset =
+ ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
+ }
+}
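+
+/* Editorial note: a worked example of the offset computation above, for
+ * Y block 5 (second row, second column of 4x4 blocks):
+ *   (5 >> 2) * 4 * y_stride + (5 & 3) * 4 == 4 * y_stride + 4,
+ * i.e. 4 rows down and 4 pixels across in the destination buffer.  The
+ * U blocks (16-19) and V blocks (20-23) share offsets because both
+ * chroma planes use the same uv_stride.
+ */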
diff --git a/libvpx/vp8/common/mfqe.c b/libvpx/vp8/common/mfqe.c
new file mode 100644
index 0000000..3dff150
--- /dev/null
+++ b/libvpx/vp8/common/mfqe.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate-limited situations keyframes may cause significant visual
+ * artifacts commonly referred to as "popping." This file implements a
+ * postprocessing algorithm that blends data from the preceding frame
+ * when there is no motion and the q of the previous frame is lower,
+ * which indicates that it was coded at higher quality.
+ */
+
+#include "postproc.h"
+#include "variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+static void filter_by_weight(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int block_size, int src_weight)
+{
+ int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int rounding_bit = 1 << (MFQE_PRECISION - 1);
+ int r, c;
+
+ for (r = 0; r < block_size; r++)
+ {
+ for (c = 0; c < block_size; c++)
+ {
+ dst[c] = (src[c] * src_weight +
+ dst[c] * dst_weight +
+ rounding_bit) >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
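+
+/* Editorial note: a worked example of the fixed-point blend above,
+ * assuming MFQE_PRECISION (from postproc.h) is 4, so the weights sum to
+ * 16 and rounding_bit is 8: with src[c] = 100, dst[c] = 50 and
+ * src_weight = 12,
+ *   (100*12 + 50*4 + 8) >> 4 == 1408 >> 4 == 88,
+ * i.e. the result sits three quarters of the way from dst towards src.
+ */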
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
+}
+
+static void apply_ifactor(unsigned char *y_src,
+ int y_src_stride,
+ unsigned char *y_dst,
+ int y_dst_stride,
+ unsigned char *u_src,
+ unsigned char *v_src,
+ int uv_src_stride,
+ unsigned char *u_dst,
+ unsigned char *v_dst,
+ int uv_dst_stride,
+ int block_size,
+ int src_weight)
+{
+ if (block_size == 16)
+ {
+ vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+ vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+ vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+ }
+ else /* if (block_size == 8) */
+ {
+ vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+ vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+ vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+ }
+}
+
+static unsigned int int_sqrt(unsigned int x)
+{
+ unsigned int y = x;
+ unsigned int guess;
+ int p = 1;
+ while (y>>=1) p++;
+ p>>=1;
+
+ guess=0;
+ while (p>=0)
+ {
+ guess |= (1<<p);
+ if (x<guess*guess)
+ guess -= (1<<p);
+ p--;
+ }
+ /* choose between guess and guess+1 */
+ return guess+(guess*guess+guess+1<=x);
+}
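+
+/* Editorial note: int_sqrt computes a rounded-to-nearest integer square
+ * root one candidate bit at a time, starting from half the bit length
+ * of x.  For x = 30: p settles at 2, the loop tries bits 2..0 (guess 4
+ * kept, 6 rejected, 5 kept), and because 5*5 + 5 + 1 = 31 > 30 the
+ * final round-up is not applied, giving 5 (sqrt(30) ~ 5.48).
+ */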
+
+#define USE_SSD
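+/* With USE_SSD defined, the block-match metric below is a per-pixel
+ * normalized sum of squared differences (the sse output of the variance
+ * functions) and the threshold is compared squared; without it the
+ * metric falls back to a per-pixel normalized SAD compared unsquared. */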
+static void multiframe_quality_enhance_block
+(
+ int blksize, /* currently the only supported values are 16 and 8 */
+ int qcurr,
+ int qprev,
+ unsigned char *y,
+ unsigned char *u,
+ unsigned char *v,
+ int y_stride,
+ int uv_stride,
+ unsigned char *yd,
+ unsigned char *ud,
+ unsigned char *vd,
+ int yd_stride,
+ int uvd_stride
+)
+{
+ static const unsigned char VP8_ZEROS[16]=
+ {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ };
+ int uvblksize = blksize >> 1;
+ int qdiff = qcurr - qprev;
+
+ int i;
+ unsigned char *up;
+ unsigned char *udp;
+ unsigned char *vp;
+ unsigned char *vdp;
+
+ unsigned int act, actd, sad, usad, vsad, sse, thr, thrsq, actrisk;
+
+ if (blksize == 16)
+ {
+ actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+ act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+#ifdef USE_SSD
+ sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+ sad = (sse + 128)>>8;
+ usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+ usad = (sse + 32)>>6;
+ vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+ vsad = (sse + 32)>>6;
+#else
+ sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
+ usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
+ vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
+#endif
+ }
+ else /* if (blksize == 8) */
+ {
+ actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+ act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+#ifdef USE_SSD
+ sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+ sad = (sse + 32)>>6;
+ usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+ usad = (sse + 8)>>4;
+ vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+ vsad = (sse + 8)>>4;
+#else
+ sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
+ usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
+ vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
+#endif
+ }
+
+ actrisk = (actd > act * 5);
+
+ /* thr = qdiff/16 + log2(actd) + log4(qprev) */
+ thr = (qdiff >> 4);
+ while (actd >>= 1) thr++;
+ while (qprev >>= 2) thr++;
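+ /* e.g. qdiff = 32, actd = 48, qprev = 64 gives
+ * thr = 32/16 + floor(log2(48)) + floor(log4(64)) = 2 + 5 + 3 = 10 */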
+
+#ifdef USE_SSD
+ thrsq = thr * thr;
+ if (sad < thrsq &&
+ /* additional checks for color mismatch and excessive addition of
+ * high-frequencies */
+ 4 * usad < thrsq && 4 * vsad < thrsq && !actrisk)
+#else
+ if (sad < thr &&
+ /* additional checks for color mismatch and excessive addition of
+ * high-frequencies */
+ 2 * usad < thr && 2 * vsad < thr && !actrisk)
+#endif
+ {
+ int ifactor;
+#ifdef USE_SSD
+ /* TODO: optimize this later to avoid the square root */
+ sad = int_sqrt(sad);
+#endif
+ ifactor = (sad << MFQE_PRECISION) / thr;
+ ifactor >>= (qdiff >> 5);
+
+ if (ifactor)
+ {
+ apply_ifactor(y, y_stride, yd, yd_stride,
+ u, v, uv_stride,
+ ud, vd, uvd_stride,
+ blksize, ifactor);
+ }
+ }
+ else /* implicitly copy from previous frame */
+ {
+ if (blksize == 16)
+ {
+ vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+ vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ }
+ else /* if (blksize == 8) */
+ {
+ vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+ for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
+ vpx_memcpy(udp, up, uvblksize);
+ for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
+ vpx_memcpy(vdp, vp, uvblksize);
+ }
+ }
+}
+
+static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map)
+{
+ if (mode_info_context->mbmi.mb_skip_coeff)
+ map[0] = map[1] = map[2] = map[3] = 1;
+ else if (mode_info_context->mbmi.mode==SPLITMV)
+ {
+ static int ndx[4][4] =
+ {
+ {0, 1, 4, 5},
+ {2, 3, 6, 7},
+ {8, 9, 12, 13},
+ {10, 11, 14, 15}
+ };
+ int i, j;
+ for (i=0; i<4; ++i)
+ {
+ map[i] = 1;
+ for (j=0; j<4 && map[j]; ++j)
+ map[i] &= (mode_info_context->bmi[ndx[i][j]].mv.as_mv.row <= 2 &&
+ mode_info_context->bmi[ndx[i][j]].mv.as_mv.col <= 2);
+ }
+ }
+ else
+ {
+ map[0] = map[1] = map[2] = map[3] =
+ (mode_info_context->mbmi.mode > B_PRED &&
+ abs(mode_info_context->mbmi.mv.as_mv.row) <= 2 &&
+ abs(mode_info_context->mbmi.mv.as_mv.col) <= 2);
+ }
+ return (map[0]+map[1]+map[2]+map[3]);
+}
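+
+/* Editorial note: qualify_inter_mb flags, per 8x8 quadrant, whether the
+ * macroblock is (nearly) static: map[i] is set when the motion vectors
+ * covering quadrant i are no more than 2 units in each component, and
+ * macroblocks with no coded coefficients (mb_skip_coeff) qualify
+ * outright.  The caller uses the return value: 4 enhances the whole
+ * 16x16 block, 1-3 enhances only the flagged 8x8 quadrants, and 0
+ * copies the macroblock through unchanged.
+ */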
+
+void vp8_multiframe_quality_enhance
+(
+ VP8_COMMON *cm
+)
+{
+ YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+ FRAME_TYPE frame_type = cm->frame_type;
+ /* Point at base of MB MODE_INFO list; it has motion vectors etc. */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mb_row;
+ int mb_col;
+ int totmap, map[4];
+ int qcurr = cm->base_qindex;
+ int qprev = cm->postproc_state.last_base_qindex;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+ unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+ /* Set up the buffer pointers */
+ y_ptr = show->y_buffer;
+ u_ptr = show->u_buffer;
+ v_ptr = show->v_buffer;
+ yd_ptr = dest->y_buffer;
+ ud_ptr = dest->u_buffer;
+ vd_ptr = dest->v_buffer;
+
+ /* postprocess each macroblock */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ /* if motion is high there will likely be no benefit */
+ if (frame_type == INTER_FRAME) totmap = qualify_inter_mb(mode_info_context, map);
+ else totmap = (frame_type == KEY_FRAME ? 4 : 0);
+ if (totmap)
+ {
+ if (totmap < 4)
+ {
+ int i, j;
+ for (i=0; i<2; ++i)
+ for (j=0; j<2; ++j)
+ {
+ if (map[i*2+j])
+ {
+ multiframe_quality_enhance_block(8, qcurr, qprev,
+ y_ptr + 8*(i*show->y_stride+j),
+ u_ptr + 4*(i*show->uv_stride+j),
+ v_ptr + 4*(i*show->uv_stride+j),
+ show->y_stride,
+ show->uv_stride,
+ yd_ptr + 8*(i*dest->y_stride+j),
+ ud_ptr + 4*(i*dest->uv_stride+j),
+ vd_ptr + 4*(i*dest->uv_stride+j),
+ dest->y_stride,
+ dest->uv_stride);
+ }
+ else
+ {
+ /* copy an 8x8 block */
+ int k;
+ unsigned char *up = u_ptr + 4*(i*show->uv_stride+j);
+ unsigned char *udp = ud_ptr + 4*(i*dest->uv_stride+j);
+ unsigned char *vp = v_ptr + 4*(i*show->uv_stride+j);
+ unsigned char *vdp = vd_ptr + 4*(i*dest->uv_stride+j);
+ vp8_copy_mem8x8(y_ptr + 8*(i*show->y_stride+j), show->y_stride,
+ yd_ptr + 8*(i*dest->y_stride+j), dest->y_stride);
+ for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
+ vp += show->uv_stride, vdp += dest->uv_stride)
+ {
+ vpx_memcpy(udp, up, 4);
+ vpx_memcpy(vdp, vp, 4);
+ }
+ }
+ }
+ }
+ else /* totmap = 4 */
+ {
+ multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+ u_ptr, v_ptr,
+ show->y_stride,
+ show->uv_stride,
+ yd_ptr, ud_ptr, vd_ptr,
+ dest->y_stride,
+ dest->uv_stride);
+ }
+ }
+ else
+ {
+ vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+ vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+ vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+ }
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+ yd_ptr += 16;
+ ud_ptr += 8;
+ vd_ptr += 8;
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
+ u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
+ ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+ vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+
+ mode_info_context++; /* Skip border mb */
+ }
+}
diff --git a/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
new file mode 100644
index 0000000..6823325
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vp8_dequant_idct_add_dspr2(short *input, short *dq,
+ unsigned char *dest, int stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+
+ vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
+
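+ /* clear all 16 dequantized coefficients for the next block
+ * (16 * sizeof(short) == 32 bytes) */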
+ vpx_memset(input, 0, 32);
+
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
new file mode 100644
index 0000000..71fdcd7
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -0,0 +1,2823 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+
+static const unsigned short sub_pel_filterss[8][3] =
+{
+ { 0, 0, 0},
+ { 0, 0x0601, 0x7b0c},
+ { 0x0201, 0x0b08, 0x6c24},
+ { 0, 0x0906, 0x5d32},
+ { 0x0303, 0x1010, 0x4d4d},
+ { 0, 0x0609, 0x325d},
+ { 0x0102, 0x080b, 0x246c},
+ { 0, 0x0106, 0x0c7b},
+};
+
+
+static const int sub_pel_filters_int[8][3] =
+{
+ { 0, 0, 0},
+ { 0x0000fffa, 0x007b000c, 0xffff0000},
+ { 0x0002fff5, 0x006c0024, 0xfff80001},
+ { 0x0000fff7, 0x005d0032, 0xfffa0000},
+ { 0x0003fff0, 0x004d004d, 0xfff00003},
+ { 0x0000fffa, 0x0032005d, 0xfff70000},
+ { 0x0001fff8, 0x0024006c, 0xfff50002},
+ { 0x0000ffff, 0x000c007b, 0xfffa0000},
+};
+
+
+static const int sub_pel_filters_inv[8][3] =
+{
+ { 0, 0, 0},
+ { 0xfffa0000, 0x000c007b, 0x0000ffff},
+ { 0xfff50002, 0x0024006c, 0x0001fff8},
+ { 0xfff70000, 0x0032005d, 0x0000fffa},
+ { 0xfff00003, 0x004d004d, 0x0003fff0},
+ { 0xfffa0000, 0x005d0032, 0x0000fff7},
+ { 0xfff80001, 0x006c0024, 0x0002fff5},
+ { 0xffff0000, 0x007b000c, 0x0000fffa},
+};
+
+
+static const int sub_pel_filters_int_tap_4[8][2] =
+{
+ { 0, 0},
+ { 0xfffa007b, 0x000cffff},
+ { 0, 0},
+ { 0xfff7005d, 0x0032fffa},
+ { 0, 0},
+ { 0xfffa0032, 0x005dfff7},
+ { 0, 0},
+ { 0xffff000c, 0x007bfffa},
+};
+
+
+static const int sub_pel_filters_inv_tap_4[8][2] =
+{
+ { 0, 0},
+ { 0x007bfffa, 0xffff000c},
+ { 0, 0},
+ { 0x005dfff7, 0xfffa0032},
+ { 0, 0},
+ { 0x0032fffa, 0xfff7005d},
+ { 0, 0},
+ { 0x000cffff, 0xfffa007b},
+};
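+
+/* Editorial note: each 32-bit entry in the tables above packs two signed
+ * 16-bit filter taps for the paired multiply-accumulates (dpa.w.ph)
+ * used below.  For example, VP8's six-tap coefficients for eighth-pel
+ * offset 1 are {0, -6, 123, 12, -1, 0}; in sub_pel_filters_int they
+ * appear as 0x0000fffa (0, -6), 0x007b000c (123, 12) and 0xffff0000
+ * (-1, 0), 0xfffa and 0xffff being the 16-bit two's complement
+ * encodings of -6 and -1.  sub_pel_filterss packs the same tap
+ * magnitudes as unsigned bytes for the dpau/dpsu byte MACs.
+ */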
+
+inline void prefetch_load(unsigned char *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+
+inline void prefetch_store(unsigned char *dst)
+{
+ __asm__ __volatile__ (
+ "pref 1, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+void dsputil_static_init(void)
+{
+ int i;
+
+ for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++)
+ {
+ ff_cropTbl[i] = 0;
+ ff_cropTbl[i + CROP_WIDTH + 256] = 255;
+ }
+}
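+
+/* Editorial note: cm = ff_cropTbl + CROP_WIDTH acts as a clamp table:
+ * for any index v in [-CROP_WIDTH, 255 + CROP_WIDTH) cm[v] is v clipped
+ * to [0, 255], e.g. cm[-5] == 0, cm[77] == 77, cm[300] == 255.  This
+ * lets the filters saturate a result with a single table load (lbux)
+ * instead of two compares.
+ */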
+
+void vp8_filter_block2d_first_pass_4
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT dst_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ int xoffset,
+ int pitch
+)
+{
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a = 64;
+ int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3;
+ unsigned int n1, n2, n3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector3b = sub_pel_filters_inv[xoffset][2];
+
+ /* if (xoffset == 0) we don't need any filtering */
+ if (vector3b == 0)
+ {
+ for (i = 0; i < output_height; i++)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += 4;
+ }
+ }
+ else
+ {
+ if (vector3b > 65536)
+ {
+ /* 6 tap filter */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ for (i = output_height; i--;)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+ [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
+ [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+ [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
+ );
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ for (i = output_height; i--;)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 1. pixel */
+ "srl %[tn1], %[tp2], 8 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+ [src_ptr] "r" (src_ptr)
+ );
+ /* Next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ }
+}
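+
+/* Editorial note: in plain C terms each output pixel of the 6-tap path
+ * above is
+ *   clamp((f0*s[-2] + f1*s[-1] + f2*s[0] + f3*s[1] + f4*s[2] + f5*s[3]
+ *          + 64) >> 7)
+ * with taps f0..f5 taken from the packed filter tables; the DSP code
+ * simply evaluates four such pixels per loop iteration with paired
+ * 16-bit MACs.  The 4-tap path drops the two outer taps, which are
+ * zero for the odd eighth-pel offsets.
+ */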
+
+void vp8_filter_block2d_first_pass_8_all
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT dst_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ int xoffset,
+ int pitch
+)
+{
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a = 64;
+ unsigned int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3, p4;
+ unsigned int n1, n2, n3, n4;
+
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ /* if (xoffset == 0) we don't need any filtering */
+ if (xoffset == 0)
+ {
+ for (i = 0; i < output_height; i++)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr[4] = src_ptr[4];
+ dst_ptr[5] = src_ptr[5];
+ dst_ptr[6] = src_ptr[6];
+ dst_ptr[7] = src_ptr[7];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += 8;
+ }
+ }
+ else
+ {
+ vector3b = sub_pel_filters_inv[xoffset][2];
+
+ if (vector3b > 65536)
+ {
+ /* 6 tap filter */
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+
+ for (i = output_height; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "ulw %[tp1], 6(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+
+ "ulw %[tn1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
+ [p4] "=&r" (p4), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
+ [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+ [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter */
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ for (i = output_height; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ "ulw %[tn2], 4(%[src_ptr]) \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "ulw %[tp1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+ [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
+ [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
+ "ulw %[tn1], 8(%[src_ptr]) \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
+ [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
+ [n3] "r" (n3), [n4] "r" (n4)
+ );
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+ }
+ }
+}
+
+
+void vp8_filter_block2d_first_pass16_6tap
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT dst_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ int xoffset,
+ int pitch
+)
+{
+ unsigned int i;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a;
+ unsigned int vector1b, vector2b, vector3b;
+ unsigned int tp1, tp2, tn1, tn2;
+ unsigned int p1, p2, p3, p4;
+ unsigned int n1, n2, n3, n4;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector1b = sub_pel_filters_inv[xoffset][0];
+ vector2b = sub_pel_filters_inv[xoffset][1];
+ vector3b = sub_pel_filters_inv[xoffset][2];
+ vector4a = 64;
+
+ for (i = output_height; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + src_pixels_per_line);
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -2(%[src_ptr]) \n\t"
+ "ulw %[tp2], 2(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p1], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "ulw %[tn2], 3(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "ulw %[tp1], 6(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[0] = cm[Temp1];
+ dst_ptr[1] = cm[Temp2];
+ dst_ptr[2] = cm[Temp3];
+ dst_ptr[3] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 3. pixel */
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector3b] \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p4], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "ulw %[tn1], 7(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 3. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "ulw %[tp2], 10(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
+ [p4] "=&r" (p4), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
+ [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
+ [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ dst_ptr[4] = cm[Temp1];
+ dst_ptr[5] = cm[Temp2];
+ dst_ptr[6] = cm[Temp3];
+ dst_ptr[7] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 5. pixel */
+ "dpa.w.ph $ac3, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+
+ /* even 6. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector3b] \n\t"
+
+ "ulw %[tn1], 11(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 5. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 6. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n3], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector3b] \n\t"
+ "ulw %[tp1], 14(%[src_ptr]) \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[tp1] \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+ [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
+ [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
+ );
+
+ /* clamp and store results */
+ dst_ptr[8] = cm[Temp1];
+ dst_ptr[9] = cm[Temp2];
+ dst_ptr[10] = cm[Temp3];
+ dst_ptr[11] = cm[Temp4];
+
+ /* next 4 pixels */
+ __asm__ __volatile__ (
+ /* even 7. pixel */
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector3b] \n\t"
+
+ /* even 8. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector3b] \n\t"
+ "ulw %[tn1], 15(%[src_ptr]) \n\t"
+ "extp %[Temp1], $ac3, 9 \n\t"
+
+ /* odd 7. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "preceu.ph.qbr %[n4], %[tn1] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector3b] \n\t"
+ "extp %[Temp3], $ac2, 9 \n\t"
+
+ /* odd 8. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "preceu.ph.qbl %[n2], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+ "extp %[Temp4], $ac2, 9 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 12(%[dst_ptr]) \n\t"
+ "sb %[tn1], 13(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[p2], 14(%[dst_ptr]) \n\t"
+ "sb %[n2], 15(%[dst_ptr]) \n\t"
+
+ : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
+ [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
+ [n3] "r" (n3), [src_ptr] "r" (src_ptr),
+ [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+ );
+
+ src_ptr += src_pixels_per_line;
+ dst_ptr += pitch;
+ }
+}
+
+
+void vp8_filter_block2d_first_pass16_0
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ unsigned int src_pixels_per_line
+)
+{
+ int Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ /* prefetch output_ptr data to cache memory */
+ prefetch_store(output_ptr + 32);
+
+ /* copy memory from src buffer to dst buffer */
+ for (i = 0; i < 7; i++)
+ {
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[output_ptr]) \n\t"
+ "sw %[Temp2], 4(%[output_ptr]) \n\t"
+ "sw %[Temp3], 8(%[output_ptr]) \n\t"
+ "sw %[Temp4], 12(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+ : [src_pixels_per_line] "r" (src_pixels_per_line),
+ [output_ptr] "r" (output_ptr)
+ );
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[output_ptr]) \n\t"
+ "sw %[Temp2], 20(%[output_ptr]) \n\t"
+ "sw %[Temp3], 24(%[output_ptr]) \n\t"
+ "sw %[Temp4], 28(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+ : [src_pixels_per_line] "r" (src_pixels_per_line),
+ [output_ptr] "r" (output_ptr)
+ );
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "ulw %[Temp3], 8(%[src_ptr]) \n\t"
+ "ulw %[Temp4], 12(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[output_ptr]) \n\t"
+ "sw %[Temp2], 36(%[output_ptr]) \n\t"
+ "sw %[Temp3], 40(%[output_ptr]) \n\t"
+ "sw %[Temp4], 44(%[output_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+ : [src_pixels_per_line] "r" (src_pixels_per_line),
+ [output_ptr] "r" (output_ptr)
+ );
+
+ output_ptr += 48;
+ }
+}
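+
+/* Editorial note: this is the copy-only first pass (no horizontal
+ * filtering): each of the 7 iterations moves three 16-byte rows with
+ * unaligned word loads and stores, 21 rows in total -- the 16 output
+ * rows plus the 5 extra rows required by the 6-tap vertical second
+ * pass.
+ */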
+
+
+void vp8_filter_block2d_first_pass16_4tap
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_width,
+ unsigned int output_height,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int pitch
+)
+{
+ unsigned int i, j;
+ int Temp1, Temp2, Temp3, Temp4;
+
+ unsigned int vector4a;
+ int vector1b, vector2b;
+ unsigned int tp1, tp2, tp3, tn1;
+ unsigned int p1, p2, p3;
+ unsigned int n1, n2, n3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+ vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+ /* if (yoffset == 0) no temp buffer is needed; data is stored directly in dst_ptr */
+ if (yoffset == 0)
+ {
+ output_height -= 5;
+ src_ptr += (src_pixels_per_line + src_pixels_per_line);
+
+ for (i = output_height; i--;)
+ {
+ __asm__ __volatile__ (
+ "ulw %[tp3], -1(%[src_ptr]) \n\t"
+ : [tp3] "=&r" (tp3)
+ : [src_ptr] "r" (src_ptr)
+ );
+
+ /* processing 4 adjacent pixels */
+ for (j = 0; j < 16; j += 4)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+ "move %[tp1], %[tp3] \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "move %[tp3], %[tp2] \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extr.w %[Temp1], $ac3, 7 \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn1], 4(%[src_ptr]) \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extr.w %[Temp3], $ac2, 7 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "extr.w %[Temp2], $ac3, 7 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extr.w %[Temp4], $ac2, 7 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "sb %[tn1], 1(%[dst_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[dst_ptr]) \n\t"
+ "sb %[n2], 3(%[dst_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+ [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr += 4;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - 16;
+ dst_ptr += pitch;
+ }
+ }
+ else
+ {
+ for (i = output_height; i--;)
+ {
+ /* processing 4 adjacent pixels */
+ for (j = 0; j < 16; j += 4)
+ {
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "ulw %[tp1], -1(%[src_ptr]) \n\t"
+ "ulw %[tp2], 3(%[src_ptr]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "extr.w %[Temp1], $ac3, 7 \n\t"
+
+ /* odd 1. pixel */
+ "ulw %[tn1], 4(%[src_ptr]) \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $0, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn1] \n\t"
+ "extr.w %[Temp3], $ac2, 7 \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $0, $ac2 \n\t"
+ "extr.w %[Temp2], $ac3, 7 \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "extr.w %[Temp4], $ac2, 7 \n\t"
+
+ /* clamp and store results */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "sb %[tp1], 0(%[output_ptr]) \n\t"
+ "sb %[tn1], 1(%[output_ptr]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+ "sb %[tp2], 2(%[output_ptr]) \n\t"
+ "sb %[n2], 3(%[output_ptr]) \n\t"
+
+ : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+ [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+ [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+ [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector4a] "r" (vector4a), [cm] "r" (cm),
+ [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr += 4;
+ }
+
+ /* next row... */
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+
+void vp8_filter_block2d_second_pass4
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ int yoffset
+)
+{
+ unsigned int i;
+
+ int Temp1, Temp2, Temp3, Temp4;
+ unsigned int vector1b, vector2b, vector3b, vector4a;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ /* load filter coefficients */
+ vector1b = sub_pel_filterss[yoffset][0];
+ vector2b = sub_pel_filterss[yoffset][2];
+ vector3b = sub_pel_filterss[yoffset][1];
+
+ if (vector1b)
+ {
+ /* 6 tap filter */
+
+ for (i = 2; i--;)
+ {
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ /* do not allow the assembler to reorder instructions */
+ __asm__ __volatile__ (
+ ".set noreorder \n\t"
+ :
+ :
+ );
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ output_ptr += output_pitch;
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = 2; i--;)
+ {
+ /* do not allow the assembler to reorder instructions */
+ __asm__ __volatile__ (
+ ".set noreorder \n\t"
+ :
+ :
+ );
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ output_ptr += output_pitch;
+
+ /* apply filter with vectors pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+
+void vp8_filter_block2d_second_pass_8
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ unsigned int output_height,
+ unsigned int output_width,
+ unsigned int yoffset
+)
+{
+ unsigned int i;
+
+ int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+ unsigned int vector1b, vector2b, vector3b, vector4a;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = sub_pel_filterss[yoffset][0];
+ vector2b = sub_pel_filterss[yoffset][2];
+ vector3b = sub_pel_filterss[yoffset][1];
+
+ if (vector1b)
+ {
+ /* 6 tap filter */
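+ /* the intermediate buffer has a pitch of 8, so the six vertical
+ * taps sit at byte offsets -16, -8, 0, 8, 16 and 24 */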
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = output_height; i--;)
+ {
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp7], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac1, 9 \n\t"
+
+ : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
+ [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+ else
+ {
+ /* 4 tap filter: taps at byte offsets -8, 0, 8 and 16 in the 8-wide intermediate buffer */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr);
+
+ for (i = output_height; i--;)
+ {
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ : [Temp1] "=r" (Temp1),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr_l1 = src_ptr[-6];
+ src_ptr_0 = src_ptr[2];
+ src_ptr_r1 = src_ptr[10];
+ src_ptr_r2 = src_ptr[18];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac0 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ : [Temp2] "=r" (Temp2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-5];
+ src_ptr_0 = src_ptr[3];
+ src_ptr_r1 = src_ptr[11];
+ src_ptr_r2 = src_ptr[19];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac1 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp3], $ac0, 9 \n\t"
+
+ : [Temp3] "=r" (Temp3)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-4];
+ src_ptr_0 = src_ptr[4];
+ src_ptr_r1 = src_ptr[12];
+ src_ptr_r2 = src_ptr[20];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp4], $ac1, 9 \n\t"
+
+ : [Temp4] "=r" (Temp4)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-3];
+ src_ptr_0 = src_ptr[5];
+ src_ptr_r1 = src_ptr[13];
+ src_ptr_r2 = src_ptr[21];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac3 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ : [Temp5] "=&r" (Temp5)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-2];
+ src_ptr_0 = src_ptr[6];
+ src_ptr_r1 = src_ptr[14];
+ src_ptr_r2 = src_ptr[22];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac0 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ : [Temp6] "=r" (Temp6)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ src_ptr_l1 = src_ptr[-1];
+ src_ptr_0 = src_ptr[7];
+ src_ptr_r1 = src_ptr[15];
+ src_ptr_r2 = src_ptr[23];
+
+ __asm__ __volatile__ (
+ "mtlo %[vector4a], $ac1 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp7], $ac0, 9 \n\t"
+ "extp %[Temp8], $ac1, 9 \n\t"
+
+ : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+ [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+ [vector4a] "r" (vector4a)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ src_ptr += 8;
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+
+void vp8_filter_block2d_second_pass161
+(
+ unsigned char *RESTRICT src_ptr,
+ unsigned char *RESTRICT output_ptr,
+ int output_pitch,
+ const unsigned short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+ unsigned int vector4a;
+ unsigned int vector1b, vector2b, vector3b;
+
+ unsigned char src_ptr_l2;
+ unsigned char src_ptr_l1;
+ unsigned char src_ptr_0;
+ unsigned char src_ptr_r1;
+ unsigned char src_ptr_r2;
+ unsigned char src_ptr_r3;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ vector4a = 64;
+
+ vector1b = vp8_filter[0];
+ vector2b = vp8_filter[2];
+ vector3b = vp8_filter[1];
+
+ if (vector1b == 0)
+ {
+ /* 4 tap filter: taps at byte offsets -16, 0, 16 and 32 in the 16-wide intermediate buffer */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 16);
+
+ for (i = 16; i--;)
+ {
+ /* inner loop unrolled: 8 output pixels per iteration */
+ for (j = 0; j < 16; j += 8)
+ {
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac3, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+ [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+ : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+ [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[j] = cm[Temp1];
+ output_ptr[j + 1] = cm[Temp2];
+ output_ptr[j + 2] = cm[Temp3];
+ output_ptr[j + 3] = cm[Temp4];
+ output_ptr[j + 4] = cm[Temp5];
+ output_ptr[j + 5] = cm[Temp6];
+ output_ptr[j + 6] = cm[Temp7];
+ output_ptr[j + 7] = cm[Temp8];
+
+ src_ptr += 8;
+ }
+
+ output_ptr += output_pitch;
+ }
+ }
+ else
+ {
+ /* 6 tap filter: taps at byte offsets -32, -16, 0, 16, 32 and 48 in the 16-wide intermediate buffer */
+
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 16);
+
+ /* loop unrolled: each iteration produces one full 16-pixel row */
+ for (i = 16; i--;)
+ {
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+ [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ /* clamp and store results */
+ output_ptr[0] = cm[Temp1];
+ output_ptr[1] = cm[Temp2];
+ output_ptr[2] = cm[Temp3];
+ output_ptr[3] = cm[Temp4];
+ output_ptr[4] = cm[Temp5];
+ output_ptr[5] = cm[Temp6];
+ output_ptr[6] = cm[Temp7];
+ output_ptr[7] = cm[Temp8];
+
+ /* apply filter with vector pairs */
+ __asm__ __volatile__ (
+ "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp1], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp2], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp3], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "extp %[Temp4], $ac3, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac0 \n\t"
+ "extp %[Temp5], $ac2, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "extp %[Temp6], $ac0, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
+
+ "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t"
+ "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "extp %[Temp7], $ac1, 9 \n\t"
+
+ "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
+ "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
+ "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
+ "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
+ "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
+ "extp %[Temp8], $ac3, 9 \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+ [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+ [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+ [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+ [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+ [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+ [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+ : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+ [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+ [src_ptr] "r" (src_ptr)
+ );
+
+ src_ptr += 16;
+ output_ptr[8] = cm[Temp1];
+ output_ptr[9] = cm[Temp2];
+ output_ptr[10] = cm[Temp3];
+ output_ptr[11] = cm[Temp4];
+ output_ptr[12] = cm[Temp5];
+ output_ptr[13] = cm[Temp6];
+ output_ptr[14] = cm[Temp7];
+ output_ptr[15] = cm[Temp8];
+
+ output_ptr += output_pitch;
+ }
+ }
+}
+
+
+void vp8_sixtap_predict4x4_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+ unsigned char FData[9 * 4]; /* Temp data buffer used in filtering */
+ unsigned int pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
+ src_pixels_per_line, 9, xoffset, 4);
+ /* then filter vertically; FData + 8 starts at row 2 of the 9-row intermediate buffer (pitch 4) */
+ vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
+ }
+ else
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
+ 4, xoffset, dst_pitch);
+}
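+
+/* For reference only: a minimal scalar sketch of the same separable two-pass
+ * flow, assuming a generic 7-bit six-tap filter table as in the portable C
+ * code (vp8/common/filter.c). The helper below is illustrative and kept out
+ * of the build; it is not part of the dspr2 implementation itself. */
+#if 0
+static void sixtap_4x4_reference(unsigned char *src, int src_stride,
+                                 const short *hfilter, const short *vfilter,
+                                 unsigned char *dst, int dst_stride)
+{
+    int FData[9 * 4]; /* 4 output rows plus 5 rows of vertical context */
+    int r, c, k, sum;
+
+    src -= 2 * src_stride; /* start two rows above the block */
+
+    /* first pass: horizontal 6-tap filter into the intermediate buffer */
+    for (r = 0; r < 9; r++)
+        for (c = 0; c < 4; c++)
+        {
+            sum = 64; /* rounding: 1 << (VP8_FILTER_SHIFT - 1) */
+            for (k = 0; k < 6; k++)
+                sum += src[r * src_stride + c + k - 2] * hfilter[k];
+            sum >>= 7;
+            FData[r * 4 + c] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+        }
+
+    /* second pass: vertical 6-tap filter over the intermediate rows */
+    for (r = 0; r < 4; r++)
+        for (c = 0; c < 4; c++)
+        {
+            sum = 64;
+            for (k = 0; k < 6; k++)
+                sum += FData[(r + k) * 4 + c] * vfilter[k];
+            sum >>= 7;
+            dst[r * dst_stride + c] = sum < 0 ? 0 : (sum > 255 ? 255 : sum);
+        }
+}
+#endif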
+
+
+void vp8_sixtap_predict8x8_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+
+ unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
+ unsigned int pos, Temp1, Temp2;
+
+ pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+
+ src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+ if (xoffset)
+ /* filter 1-D horizontally... */
+ vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+ 13, xoffset, 8);
+
+ else
+ {
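+ /* xoffset == 0: no horizontal filtering is needed, so the 13 source
+ * rows (8 output + 5 context) are copied into FData with unaligned loads */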
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[FData]) \n\t"
+ "sw %[Temp2], 4(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[FData]) \n\t"
+ "sw %[Temp2], 12(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[FData]) \n\t"
+ "sw %[Temp2], 20(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[FData]) \n\t"
+ "sw %[Temp2], 28(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[FData]) \n\t"
+ "sw %[Temp2], 36(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 40(%[FData]) \n\t"
+ "sw %[Temp2], 44(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 48(%[FData]) \n\t"
+ "sw %[Temp2], 52(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 56(%[FData]) \n\t"
+ "sw %[Temp2], 60(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 64(%[FData]) \n\t"
+ "sw %[Temp2], 68(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 72(%[FData]) \n\t"
+ "sw %[Temp2], 76(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 80(%[FData]) \n\t"
+ "sw %[Temp2], 84(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 88(%[FData]) \n\t"
+ "sw %[Temp2], 92(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 96(%[FData]) \n\t"
+ "sw %[Temp2], 100(%[FData]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+
+ /* filter vertically; FData + 16 starts at row 2 of the 13-row intermediate buffer (pitch 8) */
+ vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
+ }
+
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ else
+ {
+ if (xoffset)
+ vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+ 8, xoffset, dst_pitch);
+
+ else
+ {
+ /* copy from src buffer to dst buffer */
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 4(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 12(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 20(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 28(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 36(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 40(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 44(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 48(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 52(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 56(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 60(%[dst_ptr]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+ }
+}
+
+
+void vp8_sixtap_predict8x4_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+ unsigned char FData[9 * 8]; /* Temp data buffer used in filtering */
+ unsigned int pos, Temp1, Temp2;
+
+ pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+
+ src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+ if (xoffset)
+ /* filter 1-D horizontally... */
+ vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+ 9, xoffset, 8);
+
+ else
+ {
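+ /* xoffset == 0: copy the 9 source rows (4 output + 5 context) into
+ * FData with unaligned loads */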
+ /* prefetch src_ptr data to cache memory */
+ prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[FData]) \n\t"
+ "sw %[Temp2], 4(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[FData]) \n\t"
+ "sw %[Temp2], 12(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[FData]) \n\t"
+ "sw %[Temp2], 20(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[FData]) \n\t"
+ "sw %[Temp2], 28(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 32(%[FData]) \n\t"
+ "sw %[Temp2], 36(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 40(%[FData]) \n\t"
+ "sw %[Temp2], 44(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 48(%[FData]) \n\t"
+ "sw %[Temp2], 52(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 56(%[FData]) \n\t"
+ "sw %[Temp2], 60(%[FData]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 64(%[FData]) \n\t"
+ "sw %[Temp2], 68(%[FData]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+
+ /* filter vertically; FData + 16 starts at row 2 of the 9-row intermediate buffer (pitch 8) */
+ vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
+ }
+
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ else
+ {
+ if (xoffset)
+ vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+ 4, xoffset, dst_pitch);
+
+ else
+ {
+ /* copy from src buffer to dst buffer */
+ __asm__ __volatile__ (
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 0(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 4(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 8(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 12(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 16(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 20(%[dst_ptr]) \n\t"
+ "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"
+
+ "ulw %[Temp1], 0(%[src_ptr]) \n\t"
+ "ulw %[Temp2], 4(%[src_ptr]) \n\t"
+ "sw %[Temp1], 24(%[dst_ptr]) \n\t"
+ "sw %[Temp2], 28(%[dst_ptr]) \n\t"
+
+ : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+ : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+ [src_pixels_per_line] "r" (src_pixels_per_line)
+ );
+ }
+ }
+}
+
+
+void vp8_sixtap_predict16x16_dspr2
+(
+ unsigned char *RESTRICT src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *RESTRICT dst_ptr,
+ int dst_pitch
+)
+{
+ const unsigned short *VFilter;
+ unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
+ unsigned int pos;
+
+ VFilter = sub_pel_filterss[yoffset];
+
+ pos = 16;
+
+ /* bit position used by extp to extract results from the accumulators */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ if (yoffset)
+ {
+
+ src_ptr = src_ptr - (2 * src_pixels_per_line);
+
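+ /* even offsets use all six taps; for odd offsets the outer taps of the
+ * VP8 sub-pel filter are zero, so a 4-tap path suffices */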
+ switch (xoffset)
+ {
+ /* filter 1-D horizontally... */
+ case 2:
+ case 4:
+ case 6:
+ /* 6 tap filter */
+ vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
+ 21, xoffset, 16);
+ break;
+
+ case 0:
+ /* xoffset == 0: no horizontal filtering, just copy the source rows */
+ vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
+ break;
+
+ case 1:
+ case 3:
+ case 5:
+ case 7:
+ /* 4 tap filter */
+ vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
+ 21, xoffset, yoffset, dst_ptr, dst_pitch);
+ break;
+ }
+
+ /* filter vertically; FData + 32 starts at row 2 of the 21-row intermediate buffer (pitch 16) */
+ vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* if (yoffset == 0) the first pass writes its output directly to dst_ptr */
+ switch (xoffset)
+ {
+ case 2:
+ case 4:
+ case 6:
+ /* 6 tap filter */
+ vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
+ 16, xoffset, dst_pitch);
+ break;
+
+ case 1:
+ case 3:
+ case 5:
+ case 7:
+ /* 4 tap filter */
+ vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
+ 21, xoffset, yoffset, dst_ptr, dst_pitch);
+ break;
+ }
+ }
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
new file mode 100644
index 0000000..1e0ebd1
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+
+void vp8_dequant_idct_add_y_block_dspr2
+(short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i, j;
+
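+ /* one eob per 4x4 subblock: eob > 1 needs the full inverse transform;
+ * otherwise at most the DC term is nonzero, so the cheaper DC-only add
+ * is used and the first coefficient pair is cleared with a word store */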
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+ else
+ {
+ vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_dspr2
+(short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_dspr2(q, dq, dstu, stride);
+ else
+ {
+ vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4 * stride - 8;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1)
+ vp8_dequant_idct_add_dspr2(q, dq, dstv, stride);
+ else
+ {
+ vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4 * stride - 8;
+ }
+}
+
+#endif
+
diff --git a/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c b/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
new file mode 100644
index 0000000..25b7936
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+
+/******************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ * 1. sqrt(2) * cos (pi/8)
+ * 2. sqrt(2) * sin (pi/8)
+ * Since the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ * x * a = x + x*(a-1)
+ * so
+ * x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
+ ****************************************************************************/
+extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2 = 35468;
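+
+/* Worked example of the note above (illustrative arithmetic only):
+ * sqrt(2) * cos(pi/8) ~= 1.30656, and (1.30656 - 1) * 65536 ~= 20091, so
+ * x * sqrt(2) * cos(pi/8) is computed as x + ((x * 20091) >> 16);
+ * sqrt(2) * sin(pi/8) ~= 0.54120, and 0.54120 * 65536 ~= 35468, so
+ * x * sqrt(2) * sin(pi/8) is computed as (x * 35468) >> 16. */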
+
+inline void prefetch_load_short(short *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
+{
+ int r, c;
+ int a1, b1, c1, d1;
+ short output[16];
+ short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = 4;
+
+ int c2, d2;
+ int temp3, temp4;
+ unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+ /* prepare data for load */
+ prefetch_load_short(ip + 8);
+
+ /* first loop is unrolled */
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+ temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[13] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[0] = a1 + d1;
+ op[12] = a1 - d1;
+ op[4] = b1 + c1;
+ op[8] = b1 - c1;
+
+ a1 = ip[1] + ip[9];
+ b1 = ip[1] - ip[9];
+
+ op[1] = a1 + d2;
+ op[13] = a1 - d2;
+ op[5] = b1 + c2;
+ op[9] = b1 - c2;
+
+ a1 = ip[2] + ip[10];
+ b1 = ip[2] - ip[10];
+
+ temp1 = (ip[6] * sinpi8sqrt2) >> 16;
+ temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[14] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[7] * sinpi8sqrt2) >> 16;
+ temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[2] = a1 + d1;
+ op[14] = a1 - d1;
+ op[6] = b1 + c1;
+ op[10] = b1 - c1;
+
+ a1 = ip[3] + ip[11];
+ b1 = ip[3] - ip[11];
+
+ op[3] = a1 + d2;
+ op[15] = a1 - d2;
+ op[7] = b1 + c2;
+ op[11] = b1 - c2;
+
+ ip = output;
+
+ /* prepare data for load */
+ prefetch_load_short(ip + shortpitch);
+
+ /* second loop is unrolled */
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+ temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[7] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ a1 = ip[4] + ip[6];
+ b1 = ip[4] - ip[6];
+
+ op[4] = (a1 + d2 + 4) >> 3;
+ op[7] = (a1 - d2 + 4) >> 3;
+ op[5] = (b1 + c2 + 4) >> 3;
+ op[6] = (b1 - c2 + 4) >> 3;
+
+ a1 = ip[8] + ip[10];
+ b1 = ip[8] - ip[10];
+
+ temp1 = (ip[9] * sinpi8sqrt2) >> 16;
+ temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[11] * sinpi8sqrt2) >> 16;
+ d1 = temp1 + temp2;
+
+ temp3 = (ip[13] * sinpi8sqrt2) >> 16;
+ temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+ c2 = temp3 - temp4;
+
+ temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+ temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+ d2 = temp3 + temp4;
+
+ op[8] = (a1 + d1 + 4) >> 3;
+ op[11] = (a1 - d1 + 4) >> 3;
+ op[9] = (b1 + c1 + 4) >> 3;
+ op[10] = (b1 - c1 + 4) >> 3;
+
+ a1 = ip[12] + ip[14];
+ b1 = ip[12] - ip[14];
+
+ op[12] = (a1 + d2 + 4) >> 3;
+ op[15] = (a1 - d2 + 4) >> 3;
+ op[13] = (b1 + c2 + 4) >> 3;
+ op[14] = (b1 - c2 + 4) >> 3;
+
+ ip = output;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ short a = ip[c] + pred_ptr[c];
+ dst_ptr[c] = cm[a];
+ }
+
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
+ }
+}
+
+void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride)
+{
+ int a1;
+ int i, absa1;
+ int t2, vector_a1, vector_a;
+
+ /* a1 = ((input_dc + 4) >> 3); */
+ __asm__ __volatile__ (
+ "addi %[a1], %[input_dc], 4 \n\t"
+ "sra %[a1], %[a1], 3 \n\t"
+ : [a1] "=r" (a1)
+ : [input_dc] "r" (input_dc)
+ );
+
+ if (a1 < 0)
+ {
+ /* process quad-bytes: input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ /* a1 is negative: compute pred_ptr[c] - |a1| with a saturating subtract instead of pred_ptr[c] + a1 */
+ for (i = 4; i--;)
+ {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[pred_ptr]) \n\t"
+ "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dst_ptr]) \n\t"
+ "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+ : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+ else
+ {
+ /* process quad-bytes: input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (i = 4; i--;)
+ {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[pred_ptr]) \n\t"
+ "add %[pred_ptr], %[pred_ptr], %[pred_stride] \n\t"
+ "addu_s.qb %[vector_a], %[vector_a1], %[t2] \n\t"
+ "sw %[vector_a], 0(%[dst_ptr]) \n\t"
+ "add %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+ : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+
+}
+
+void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
+{
+ short output[16];
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ prefetch_load_short(ip);
+
+ for (i = 4; i--;)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+ op[0] = a1 + b1;
+ op[4] = c1 + d1;
+ op[8] = a1 - b1;
+ op[12] = d1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ ip = output;
+ op = output;
+
+ prefetch_load_short(ip);
+
+ for (i = 4; i--;)
+ {
+ a1 = ip[0] + ip[3] + 3;
+ b1 = ip[1] + ip[2];
+ c1 = ip[1] - ip[2];
+ d1 = ip[0] - ip[3] + 3;
+
+ a2 = a1 + b1;
+ b2 = d1 + c1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ op[0] = a2 >> 3;
+ op[1] = b2 >> 3;
+ op[2] = c2 >> 3;
+ op[3] = d2 >> 3;
+
+ ip += 4;
+ op += 4;
+ }
+
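+ /* scatter: output[i] becomes the DC coefficient of the i-th 16-element
+ * coefficient block, hence the stride of 16 in mb_dqcoeff */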
+ for (i = 0; i < 16; i++)
+ {
+ mb_dqcoeff[i * 16] = output[i];
+ }
+}
+
+void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
+{
+ int a1;
+
+ a1 = ((input[0] + 3) >> 3);
+
+ __asm__ __volatile__ (
+ "sh %[a1], 0(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 32(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 64(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 96(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 128(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 160(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 192(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 224(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 256(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 288(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 320(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 352(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 384(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 416(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 448(%[mb_dqcoeff]) \n\t"
+ "sh %[a1], 480(%[mb_dqcoeff]) \n\t"
+
+ :
+ : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff)
+ );
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c b/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
new file mode 100644
index 0000000..b8e5e4d
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
@@ -0,0 +1,2622 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+
+#if HAVE_DSPR2
+typedef unsigned char uc;
+
+/* prefetch data for load */
+inline void prefetch_load_lf(unsigned char *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+
+/* prefetch data for store */
+inline void prefetch_store_lf(unsigned char *dst)
+{
+ __asm__ __volatile__ (
+ "pref 1, 0(%[dst]) \n\t"
+ :
+ : [dst] "r" (dst)
+ );
+}
+
+/* process 4 pixels at the same time,
+ * computing hev and mask in the same function
+ */
+static __inline void vp8_filter_mask_vec_mips
+(
+ uint32_t limit,
+ uint32_t flimit,
+ uint32_t p1,
+ uint32_t p0,
+ uint32_t p3,
+ uint32_t p2,
+ uint32_t q0,
+ uint32_t q1,
+ uint32_t q2,
+ uint32_t q3,
+ uint32_t thresh,
+ uint32_t *hev,
+ uint32_t *mask
+)
+{
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__ (
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k),
+ [r] "=&r" (r), [r3] "=&r" (r3)
+ : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+ [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+ );
+
+ __asm__ __volatile__ (
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+ [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+ [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+ );
+
+ *hev = hev1;
+ *mask = s2;
+}
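+
+/* Scalar reference for the vector code above (a sketch, not part of the
+ * build): per byte lane,
+ *
+ *     mask = 0xFF only if none of
+ *         abs(p3 - p2) > limit, abs(p2 - p1) > limit,
+ *         abs(p1 - p0) > limit, abs(q1 - q0) > limit,
+ *         abs(q2 - q1) > limit, abs(q3 - q2) > limit,
+ *         abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit
+ *     holds, and hev = 0xFF if abs(p1 - p0) > thresh or abs(q1 - q0) > thresh;
+ * the wrdsp/pick.qb pairs expand the per-lane compare bits to 0x00/0xFF.
+ */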
+
+
+/* inputs & outputs are quad-byte vectors */
+static __inline void vp8_filter_mips
+(
+ uint32_t mask,
+ uint32_t hev,
+ uint32_t *ps1,
+ uint32_t *ps0,
+ uint32_t *qs0,
+ uint32_t *qs1
+)
+{
+ int32_t vp8_filter_l, vp8_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
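+    /* xor with N128 flips the top bit of every byte, converting the
+     * unsigned pixels to the signed form the filter arithmetic expects;
+     * the same xor undoes the bias on the way out
+     */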
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+    /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__ (
+ /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp8_filter &= hev; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[hev_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[hev_r] \n\t"
+
+ /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+
+ /* vp8_filter &= mask; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
+
+ : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+ [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+ [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+ [HWM] "r" (HWM)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[vp8_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp8_filter_r], %[t2] \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[vp8_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp8_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+
+ : [t1] "r" (t1), [t2] "r" (t2),
+ [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+ [HWM] "r" (HWM)
+ );
+
+ __asm__ __volatile__ (
+ /* (vp8_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp8_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+ [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+
+ : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__ (
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+ [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+ :
+ );
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
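+
+/* The halfword arithmetic above implements the usual VP8 filter core; per
+ * pixel, in the signed domain, it computes:
+ *
+ *     f = clamp(ps1 - qs1) & hev;
+ *     f = clamp(f + 3 * (qs0 - ps0)) & mask;
+ *     Filter1 = clamp(f + 4) >> 3;  qs0 = clamp(qs0 - Filter1);
+ *     Filter2 = clamp(f + 3) >> 3;  ps0 = clamp(ps0 + Filter2);
+ *     f = ((Filter1 + 1) >> 1) & ~hev;
+ *     ps1 = clamp(ps1 + f);  qs1 = clamp(qs1 - f);
+ */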
+
+void vp8_loop_filter_horizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+ /* prefetch data for store */
+ prefetch_store_lf(s);
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+    s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+}
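+
+/* The body above is one 4-pixel group unrolled four times, covering the 16
+ * pixels of a luma edge; the uv variant below runs only two groups for the
+ * 8-pixel chroma edge.
+ */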
+
+void vp8_loop_filter_uvhorizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+    s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood */
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ }
+ }
+}
+
+void vp8_loop_filter_vertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ const unsigned int flimit,
+ const unsigned int limit,
+ const unsigned int thresh,
+ int count
+)
+{
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ hev = 0;
+ mask = 0;
+ i = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+ do
+ {
+
+ /* prefetch data for store */
+ prefetch_store_lf(s + p);
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
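+        /* the precrq/precr pairs below pick alternating bytes from two
+         * words; two interleaving rounds plus the precrq.ph.w/append step
+         * amount to a full 4x4 byte transpose done entirely in registers
+         */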
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+
+ i += 8;
+    } while (i < count);
+}
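+
+/* Vertical edges cannot be written back with word stores: each filtered
+ * column sits at an arbitrary byte offset, so every result word is emitted
+ * as four sb stores, with srl shifting the next byte into place between
+ * them.
+ */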
+
+void vp8_loop_filter_uvvertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+
+ s1 = s4 + p;
+ s2 = s1 + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1)
+ );
+ }
+ }
+}
+
+/* inputs & outputs are quad-byte vectors */
+static __inline void vp8_mbfilter_mips
+(
+ uint32_t mask,
+ uint32_t hev,
+ uint32_t *ps2,
+ uint32_t *ps1,
+ uint32_t *ps0,
+ uint32_t *qs0,
+ uint32_t *qs1,
+ uint32_t *qs2
+)
+{
+ int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
+ int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
+ int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
+ uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
+ uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
+ uint32_t N128, R63;
+ uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
+
+ R63 = 0x003F003F;
+ HWM = 0xFF00FF00;
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vps2 = (*ps2) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+ vqs2 = (*qs2) ^ N128;
+
+    /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ vqs2_l = vqs2 & HWM;
+ vqs2_r = vqs2 << 8;
+ vqs2_r = vqs2_r & HWM;
+
+ __asm__ __volatile__ (
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp8_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp8_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
+ );
+
+ vps2_l = vps2 & HWM;
+ vps2_r = vps2 << 8;
+ vps2_r = vps2_r & HWM;
+
+ /* add outer taps if we have high edge variance */
+ __asm__ __volatile__ (
+ /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "and %[mask_l], %[HWM], %[mask] \n\t"
+ "sll %[mask_r], %[mask], 8 \n\t"
+ "and %[mask_r], %[HWM], %[mask_r] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+ "and %[hev_l], %[HWM], %[hev] \n\t"
+ "sll %[hev_r], %[hev], 8 \n\t"
+ "and %[hev_r], %[HWM], %[hev_r] \n\t"
+ "addq_s.ph %[vp8_filter_l], %[vp8_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp8_filter_r], %[vp8_filter_r], %[subr_r] \n\t"
+
+ /* vp8_filter &= mask; */
+ "and %[vp8_filter_l], %[vp8_filter_l], %[mask_l] \n\t"
+ "and %[vp8_filter_r], %[vp8_filter_r], %[mask_r] \n\t"
+
+ /* Filter2 = vp8_filter & hev; */
+ "and %[Filter2_l], %[vp8_filter_l], %[hev_l] \n\t"
+ "and %[Filter2_r], %[vp8_filter_r], %[hev_r] \n\t"
+
+ : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
+ [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
+ [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
+ : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
+ [HWM] "r" (HWM), [hev] "r" (hev), [mask] "r" (mask)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3; */
+ "addq_s.ph %[Filter1_l], %[Filter2_l], %[t2] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[Filter1_r], %[Filter2_r], %[t2] \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >> 3; */
+ "addq_s.ph %[Filter2_l], %[Filter2_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[Filter2_r], %[t1] \n\t"
+
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+
+ /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
+ [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
+ );
+
+ /* only apply wider filter if not high edge variance */
+ __asm__ __volatile__ (
+ /* vp8_filter &= ~hev; */
+ "and %[Filter2_l], %[vp8_filter_l], %[invhev_l] \n\t"
+ "and %[Filter2_r], %[vp8_filter_r], %[invhev_r] \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 8 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 8 \n\t"
+
+ : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
+ : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+ [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* roughly 3/7th difference across boundary */
+ __asm__ __volatile__ (
+ "shll.ph %[u3_l], %[Filter2_l], 3 \n\t"
+ "shll.ph %[u3_r], %[Filter2_r], 3 \n\t"
+
+ "addq.ph %[u3_l], %[u3_l], %[Filter2_l] \n\t"
+ "addq.ph %[u3_r], %[u3_r], %[Filter2_r] \n\t"
+
+ "shll.ph %[u2_l], %[u3_l], 1 \n\t"
+ "shll.ph %[u2_r], %[u3_r], 1 \n\t"
+
+ "addq.ph %[u1_l], %[u3_l], %[u2_l] \n\t"
+ "addq.ph %[u1_r], %[u3_r], %[u2_r] \n\t"
+
+ "addq.ph %[u2_l], %[u2_l], %[R63] \n\t"
+ "addq.ph %[u2_r], %[u2_r], %[R63] \n\t"
+
+ "addq.ph %[u3_l], %[u3_l], %[R63] \n\t"
+ "addq.ph %[u3_r], %[u3_r], %[R63] \n\t"
+
+ /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
+ * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
+ */
+ "addq.ph %[u1_l], %[u1_l], %[R63] \n\t"
+ "addq.ph %[u1_r], %[u1_r], %[R63] \n\t"
+ "shra.ph %[u1_l], %[u1_l], 7 \n\t"
+ "shra.ph %[u1_r], %[u1_r], 7 \n\t"
+ "shra.ph %[u2_l], %[u2_l], 7 \n\t"
+ "shra.ph %[u2_r], %[u2_r], 7 \n\t"
+ "shll.ph %[u1_l], %[u1_l], 8 \n\t"
+ "shll.ph %[u1_r], %[u1_r], 8 \n\t"
+ "shll.ph %[u2_l], %[u2_l], 8 \n\t"
+ "shll.ph %[u2_r], %[u2_r], 8 \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[u1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[u1_r] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + u); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[u1_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[u1_r] \n\t"
+
+ : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
+ [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [R63] "r" (R63),
+ [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
+ );
+
+ __asm__ __volatile__ (
+ /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[u2_l] \n\t"
+ "addq_s.ph %[vps1_l], %[vps1_l], %[u2_l] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + u); */
+ "addq_s.ph %[vps1_r], %[vps1_r], %[u2_r] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[u2_r] \n\t"
+
+ : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+ : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
+ );
+
+ /* roughly 1/7th difference across boundary */
+ __asm__ __volatile__ (
+ /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
+ "shra.ph %[u3_l], %[u3_l], 7 \n\t"
+ "shra.ph %[u3_r], %[u3_r], 7 \n\t"
+ "shll.ph %[u3_l], %[u3_l], 8 \n\t"
+ "shll.ph %[u3_r], %[u3_r], 8 \n\t"
+
+ /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
+ "subq_s.ph %[vqs2_l], %[vqs2_l], %[u3_l] \n\t"
+ "subq_s.ph %[vqs2_r], %[vqs2_r], %[u3_r] \n\t"
+
+ /* vps2 = vp8_signed_char_clamp(ps2 + u); */
+ "addq_s.ph %[vps2_l], %[vps2_l], %[u3_l] \n\t"
+ "addq_s.ph %[vps2_r], %[vps2_r], %[u3_r] \n\t"
+
+ : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
+ [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
+ :
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ __asm__ __volatile__ (
+ "and %[vqs0_l], %[vqs0_l], %[HWM] \n\t"
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+
+ "and %[vps0_l], %[vps0_l], %[HWM] \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+
+ "and %[vqs1_l], %[vqs1_l], %[HWM] \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+
+ "and %[vps1_l], %[vps1_l], %[HWM] \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ "and %[vqs2_l], %[vqs2_l], %[HWM] \n\t"
+ "shrl.ph %[vqs2_r], %[vqs2_r], 8 \n\t"
+
+ "and %[vps2_l], %[vps2_l], %[HWM] \n\t"
+ "shrl.ph %[vps2_r], %[vps2_r], 8 \n\t"
+
+ "or %[vqs0_r], %[vqs0_l], %[vqs0_r] \n\t"
+ "or %[vps0_r], %[vps0_l], %[vps0_r] \n\t"
+ "or %[vqs1_r], %[vqs1_l], %[vqs1_r] \n\t"
+ "or %[vps1_r], %[vps1_l], %[vps1_r] \n\t"
+ "or %[vqs2_r], %[vqs2_l], %[vqs2_r] \n\t"
+ "or %[vps2_r], %[vps2_l], %[vps2_r] \n\t"
+
+ : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
+ [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
+ [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
+ : [HWM] "r" (HWM)
+ );
+
+ *ps0 = vps0_r ^ N128;
+ *ps1 = vps1_r ^ N128;
+ *ps2 = vps2_r ^ N128;
+ *qs0 = vqs0_r ^ N128;
+ *qs1 = vqs1_r ^ N128;
+ *qs2 = vqs2_r ^ N128;
+}
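+
+/* The 27/18/9 taps above are the VP8 macroblock filter; with
+ * F = vp8_filter & ~hev, each lane is updated as:
+ *
+ *     u = clamp((63 + F * 27) >> 7);  qs0 -= u;  ps0 += u;
+ *     u = clamp((63 + F * 18) >> 7);  qs1 -= u;  ps1 += u;
+ *     u = clamp((63 + F * 9) >> 7);   qs2 -= u;  ps2 += u;
+ *
+ * the shll/addq sequence builds the products as 9F = 8F + F, 18F = 2 * 9F
+ * and 27F = 9F + 18F.
+ */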
+
+void vp8_mbloop_filter_horizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ i = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+ s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* prefetch data for load */
+ prefetch_load_lf(s + p);
+
+    /* apply the filter to 4 pixels at the same time */
+ do
+ {
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ i += 8;
+    } while (i < count);
+}
+
+void vp8_mbloop_filter_uvhorizontal_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+
+ mask = 0;
+ hev = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+ sm1 = s - (p << 2);
+ s0 = s - p - p - p;
+ s1 = s - p - p;
+ s2 = s - p;
+ s3 = s;
+ s4 = s + p;
+ s5 = s + p + p;
+ s6 = s + p + p + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+
+ sm1 += 4;
+ s0 += 4;
+ s1 += 4;
+ s2 += 4;
+ s3 += 4;
+ s4 += 4;
+ s5 += 4;
+ s6 += 4;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p1 = *((uint32_t *)(s1));
+ p2 = *((uint32_t *)(s2));
+ p3 = *((uint32_t *)(s3));
+ p4 = *((uint32_t *)(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ pm1 = *((uint32_t *)(sm1));
+ p0 = *((uint32_t *)(s0));
+ p5 = *((uint32_t *)(s5));
+ p6 = *((uint32_t *)(s6));
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* unpack processed 4x4 neighborhood
+ * memory is 4 byte aligned
+ */
+ *((uint32_t *)s0) = p0;
+ *((uint32_t *)s1) = p1;
+ *((uint32_t *)s2) = p2;
+ *((uint32_t *)s3) = p3;
+ *((uint32_t *)s4) = p4;
+ *((uint32_t *)s5) = p5;
+ }
+ }
+}
+
+
+void vp8_mbloop_filter_vertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+
+ int i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ mask = 0;
+ hev = 0;
+ i = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+ do
+ {
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+ s = s4 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+            /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+ }
+ }
+
+ i += 4;
+    } while (i < count);
+}
+
+void vp8_mbloop_filter_uvvertical_edge_mips
+(
+ unsigned char *s,
+ int p,
+ unsigned int flimit,
+ unsigned int limit,
+ unsigned int thresh,
+ int count
+)
+{
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ unsigned char *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+ mask = 0;
+ hev = 0;
+ pm1 = 0;
+ p0 = 0;
+ p1 = 0;
+ p2 = 0;
+ p3 = 0;
+ p4 = 0;
+ p5 = 0;
+ p6 = 0;
+
+    /* the loop filter is designed to work on chars so that we can make
+     * maximum use of 8-bit SIMD instructions.
+     */
+
+    /* apply the filter to 4 pixels at the same time */
+
+ s1 = s;
+ s2 = s + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* prefetch data for load */
+ prefetch_load_lf(s + 2 * p);
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+ thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+ }
+ }
+
+ s1 = s4 + p;
+ s2 = s1 + p;
+ s3 = s2 + p;
+ s4 = s3 + p;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+ {
+
+ vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+ if (mask)
+ {
+ /* filtering */
+ vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+ /* don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s4]) \n\t"
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+ "sb %[p0], -3(%[s4]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s3]) \n\t"
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+ "sb %[p0], -3(%[s3]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s2]) \n\t"
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+ "sb %[p0], -3(%[s2]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p5], %[p5], 8 \n\t"
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+ "srl %[p0], %[p0], 8 \n\t"
+ : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p5], 2(%[s1]) \n\t"
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+ "sb %[p0], -3(%[s1]) \n\t"
+ :
+ : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+ );
+ }
+ }
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->mblim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ {
+ vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+ }
+
+ if (v_ptr)
+ {
+ vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+ }
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->mblim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+ if (v_ptr)
+ vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->blim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+ if (v_ptr)
+ vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ unsigned int thresh_vec, flimit_vec, limit_vec;
+ unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+ limit = *(lfi->lim);
+ flimit_temp = *(lfi->blim);
+ thresh = *(lfi->hev_thr);
+ flimit = flimit_temp;
+
+    /* replicate each scalar into a quad-byte vector */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[thresh] \n\t"
+ "replv.qb %[flimit_vec], %[flimit] \n\t"
+ "replv.qb %[limit_vec], %[limit] \n\t"
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+ : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+ );
+
+ vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+ vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+ if (u_ptr)
+ vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+ if (v_ptr)
+ vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+#endif
diff --git a/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c b/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
new file mode 100644
index 0000000..a5239a3
--- /dev/null
+++ b/libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_DSPR2
+inline void prefetch_load_int(unsigned char *src)
+{
+ __asm__ __volatile__ (
+ "pref 0, 0(%[src]) \n\t"
+ :
+ : [src] "r" (src)
+ );
+}
+
+
+__inline void vp8_copy_mem16x16_dspr2(
+ unsigned char *RESTRICT src,
+ int src_stride,
+ unsigned char *RESTRICT dst,
+ int dst_stride)
+{
+ int r;
+ unsigned int a0, a1, a2, a3;
+
+ for (r = 16; r--;)
+ {
+ /* load src data in cache memory */
+ prefetch_load_int(src + src_stride);
+
+ /* use unaligned memory load and store */
+ __asm__ __volatile__ (
+ "ulw %[a0], 0(%[src]) \n\t"
+ "ulw %[a1], 4(%[src]) \n\t"
+ "ulw %[a2], 8(%[src]) \n\t"
+ "ulw %[a3], 12(%[src]) \n\t"
+ "sw %[a0], 0(%[dst]) \n\t"
+ "sw %[a1], 4(%[dst]) \n\t"
+ "sw %[a2], 8(%[dst]) \n\t"
+ "sw %[a3], 12(%[dst]) \n\t"
+ : [a0] "=&r" (a0), [a1] "=&r" (a1),
+ [a2] "=&r" (a2), [a3] "=&r" (a3)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+
+__inline void vp8_copy_mem8x8_dspr2(
+ unsigned char *RESTRICT src,
+ int src_stride,
+ unsigned char *RESTRICT dst,
+ int dst_stride)
+{
+ int r;
+ unsigned int a0, a1;
+
+ /* load src data in cache memory */
+ prefetch_load_int(src + src_stride);
+
+ for (r = 8; r--;)
+ {
+ /* use unaligned memory load and store */
+ __asm__ __volatile__ (
+ "ulw %[a0], 0(%[src]) \n\t"
+ "ulw %[a1], 4(%[src]) \n\t"
+ "sw %[a0], 0(%[dst]) \n\t"
+ "sw %[a1], 4(%[dst]) \n\t"
+ : [a0] "=&r" (a0), [a1] "=&r" (a1)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+
+__inline void vp8_copy_mem8x4_dspr2(
+ unsigned char *RESTRICT src,
+ int src_stride,
+ unsigned char *RESTRICT dst,
+ int dst_stride)
+{
+ int r;
+ unsigned int a0, a1;
+
+ /* load src data in cache memory */
+ prefetch_load_int(src + src_stride);
+
+ for (r = 4; r--;)
+ {
+ /* use unaligned memory load and store */
+ __asm__ __volatile__ (
+ "ulw %[a0], 0(%[src]) \n\t"
+ "ulw %[a1], 4(%[src]) \n\t"
+ "sw %[a0], 0(%[dst]) \n\t"
+ "sw %[a1], 4(%[dst]) \n\t"
+ : [a0] "=&r" (a0), [a1] "=&r" (a1)
+ : [src] "r" (src), [dst] "r" (dst)
+ );
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
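+
+/* Each copy routine above is behaviorally a plain strided row copy; the
+ * ulw/sw pairs move 4 bytes per instruction from a possibly unaligned
+ * source to a word-aligned destination. A minimal portable sketch of the
+ * same operation (illustrative only; copy_mem_rows_c is not part of this
+ * file):
+ *
+ *     static void copy_mem_rows_c(unsigned char *src, int src_stride,
+ *                                 unsigned char *dst, int dst_stride,
+ *                                 int width, int rows)
+ *     {
+ *         int r, c;
+ *
+ *         for (r = 0; r < rows; r++)
+ *         {
+ *             for (c = 0; c < width; c++)
+ *                 dst[c] = src[c];
+ *
+ *             src += src_stride;
+ *             dst += dst_stride;
+ *         }
+ *     }
+ */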
+
+#endif
diff --git a/libvpx/vp8/common/modecont.c b/libvpx/vp8/common/modecont.c
new file mode 100644
index 0000000..86a74bc
--- /dev/null
+++ b/libvpx/vp8/common/modecont.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "entropy.h"
+
+const int vp8_mode_contexts[6][4] =
+{
+ {
+ /* 0 */
+ 7, 1, 1, 143,
+ },
+ {
+ /* 1 */
+ 14, 18, 14, 107,
+ },
+ {
+ /* 2 */
+ 135, 64, 57, 68,
+ },
+ {
+ /* 3 */
+ 60, 56, 128, 65,
+ },
+ {
+ /* 4 */
+ 159, 134, 128, 34,
+ },
+ {
+ /* 5 */
+ 234, 188, 128, 28,
+ },
+};
diff --git a/libvpx/vp8/common/modecont.h b/libvpx/vp8/common/modecont.h
new file mode 100644
index 0000000..24db882
--- /dev/null
+++ b/libvpx/vp8/common/modecont.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECONT_H
+#define __INC_MODECONT_H
+
+extern const int vp8_mode_contexts[6][4];
+
+#endif
diff --git a/libvpx/vp8/common/mv.h b/libvpx/vp8/common/mv.h
new file mode 100644
index 0000000..b3f919d
--- /dev/null
+++ b/libvpx/vp8/common/mv.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+#include "vpx/vpx_integer.h"
+
+typedef struct
+{
+ short row;
+ short col;
+} MV;
+
+typedef union int_mv
+{
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
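+
+/* A minimal usage sketch (illustrative, not part of the original header):
+ * because an MV packs into 32 bits, motion vectors can be copied and
+ * compared with single integer operations:
+ *
+ *     int_mv a, b;
+ *
+ *     a.as_mv.row = 4;
+ *     a.as_mv.col = -2;
+ *     b.as_int = a.as_int;           -- one 32-bit copy
+ *     if (a.as_int == b.as_int) ...  -- single-compare equality test
+ */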
+
+#endif
diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h
new file mode 100644
index 0000000..766b4ea
--- /dev/null
+++ b/libvpx/vp8/common/onyx.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_H
+#define __INC_VP8_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+
+ struct VP8_COMP;
+
+ /* Create/destroy static data structures. */
+
+ typedef enum
+ {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+
+ } VPX_SCALING;
+
+ typedef enum
+ {
+ USAGE_STREAM_FROM_SERVER = 0x0,
+ USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+ USAGE_CONSTRAINED_QUALITY = 0x2
+ } END_USAGE;
+
+
+ typedef enum
+ {
+ MODE_REALTIME = 0x0,
+ MODE_GOODQUALITY = 0x1,
+ MODE_BESTQUALITY = 0x2,
+ MODE_FIRSTPASS = 0x3,
+ MODE_SECONDPASS = 0x4,
+ MODE_SECONDPASS_BEST = 0x5
+ } MODE;
+
+ typedef enum
+ {
+ FRAMEFLAGS_KEY = 1,
+ FRAMEFLAGS_GOLDEN = 2,
+ FRAMEFLAGS_ALTREF = 4
+ } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+ static void Scale2Ratio(int mode, int *hr, int *hs)
+ {
+ switch (mode)
+ {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+ }
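+
+    /* e.g. Scale2Ratio(THREEFIVE, &hr, &hs) yields hr = 3 and hs = 5,
+     * i.e. a 3/5 scaling ratio.
+     */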
+
+ typedef struct
+ {
+ /* 4 versions of bitstream defined:
+ * 0 best quality/slowest decode, 3 lowest quality/fastest decode
+ */
+ int Version;
+ int Width;
+ int Height;
+ struct vpx_rational timebase;
+ unsigned int target_bandwidth; /* kilobits per second */
+
+ /* parameter used for applying pre processing blur: recommendation 0 */
+ int noise_sensitivity;
+
+ /* parameter used for sharpening output: recommendation 0: */
+ int Sharpness;
+ int cpu_used;
+ unsigned int rc_max_intra_bitrate_pct;
+
+ /* mode ->
+     *(0)=Realtime/Live Encoding. This mode is optimized for realtime
+ * encoding (for example, capturing a television signal or feed
+ * from a live camera). ( speed setting controls how fast )
+ *(1)=Good Quality Fast Encoding. The encoder balances quality with
+ * the amount of time it takes to encode the output. ( speed
+ * setting controls how fast )
+ *(2)=One Pass - Best Quality. The encoder places priority on the
+ * quality of the output over encoding speed. The output is
+ * compressed at the highest possible quality. This option takes
+ * the longest amount of time to encode. ( speed setting ignored
+ * )
+ *(3)=Two Pass - First Pass. The encoder generates a file of
+ * statistics for use in the second encoding pass. ( speed
+ * setting controls how fast )
+ *(4)=Two Pass - Second Pass. The encoder uses the statistics that
+ * were generated in the first encoding pass to create the
+ * compressed output. ( speed setting controls how fast )
+ *(5)=Two Pass - Second Pass Best. The encoder uses the statistics
+ * that were generated in the first encoding pass to create the
+ * compressed output using the highest possible quality, and
+     *            taking a longer amount of time to encode. ( speed setting
+ * ignored )
+ */
+ int Mode;
+
+ /* Key Framing Operations */
+ int auto_key; /* automatically detect cut scenes */
+ int key_freq; /* maximum distance to key frame. */
+
+ /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */
+ int allow_lag;
+ int lag_in_frames; /* how many frames lag before we start encoding */
+
+ /*
+ * DATARATE CONTROL OPTIONS
+ */
+
+ int end_usage; /* vbr or cbr */
+
+ /* buffer targeting aggressiveness */
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ /* buffering parameters */
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ int64_t starting_buffer_level_in_ms;
+ int64_t optimal_buffer_level_in_ms;
+ int64_t maximum_buffer_size_in_ms;
+
+ /* controlling quality */
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+
+ /* allow internal resizing */
+ int allow_spatial_resampling;
+ int resample_down_water_mark;
+ int resample_up_water_mark;
+
+ /* allow internal frame rate alterations */
+ int allow_df;
+ int drop_frames_water_mark;
+
+ /* two pass datarate control */
+ int two_pass_vbrbias;
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+
+ /*
+ * END DATARATE CONTROL OPTIONS
+ */
+
+    /* these parameters aren't to be used in the final build; don't use!!! */
+ int play_alternate;
+ int alt_freq;
+ int alt_q;
+ int key_q;
+ int gold_q;
+
+
+ int multi_threaded; /* how many threads to run the encoder on */
+ int token_partitions; /* how many token partitions to create */
+
+ /* early breakout threshold: for video conf recommend 800 */
+ int encode_breakout;
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ int arnr_max_frames;
+ int arnr_strength;
+ int arnr_type;
+
+ struct vpx_fixed_buf two_pass_stats_in;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+ vp8e_tuning tuning;
+
+ /* Temporal scaling parameters */
+ unsigned int number_of_layers;
+ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
+ unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
+ unsigned int periodicity;
+ unsigned int layer_id[VPX_TS_MAX_PERIODICITY];
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Number of total resolutions encoded */
+ unsigned int mr_total_resolutions;
+
+ /* Current encoder ID */
+ unsigned int mr_encoder_id;
+
+ /* Down-sampling factor */
+ vpx_rational_t mr_down_sampling_factor;
+
+ /* Memory location to store low-resolution encoder's mode info */
+ void* mr_low_res_mode_info;
+#endif
+ } VP8_CONFIG;
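+
+    /* A minimal configuration sketch (illustrative; the field values below
+     * are placeholder assumptions, not upstream recommendations):
+     *
+     *     VP8_CONFIG oxcf;
+     *
+     *     memset(&oxcf, 0, sizeof(oxcf));
+     *     oxcf.Width = 640;
+     *     oxcf.Height = 480;
+     *     oxcf.timebase.num = 1;
+     *     oxcf.timebase.den = 30;
+     *     oxcf.target_bandwidth = 800;    -- kilobits per second
+     *     oxcf.Mode = MODE_REALTIME;
+     *
+     * then pass &oxcf to vp8_create_compressor() below.
+     */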
+
+
+ void vp8_initialize();
+
+ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf);
+ void vp8_remove_compressor(struct VP8_COMP* *comp);
+
+ void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+ void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
+
+ int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
+ int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
+ int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
+
+ int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
+ int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
+ int vp8_get_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_set_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ int vp8_update_entropy(struct VP8_COMP* comp, int update);
+ int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
+ int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
+ int vp8_set_internal_size(struct VP8_COMP* comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+ int vp8_get_quantizer(struct VP8_COMP* c);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/libvpx/vp8/common/onyxc_int.h b/libvpx/vp8/common/onyxc_int.h
new file mode 100644
index 0000000..5325bac
--- /dev/null
+++ b/libvpx/vp8/common/onyxc_int.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8C_INT_H
+#define __INC_VP8C_INT_H
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "loopfilter.h"
+#include "entropymv.h"
+#include "entropy.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif
+
+/*#ifdef PACKET_TESTING*/
+#include "header.h"
+/*#endif*/
+
+#define MINQ 0
+#define MAXQ 127
+#define QINDEX_RANGE (MAXQ + 1)
+
+#define NUM_YV12_BUFFERS 4
+
+#define MAX_PARTITIONS 9
+
+typedef struct frame_contexts
+{
+ vp8_prob bmode_prob [VP8_BINTRAMODES-1];
+ vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */
+ vp8_prob uv_mode_prob [VP8_UV_MODES-1];
+ vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
+ vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ MV_CONTEXT mvc[2];
+} FRAME_CONTEXT;
+
+typedef enum
+{
+ ONE_PARTITION = 0,
+ TWO_PARTITION = 1,
+ FOUR_PARTITION = 2,
+ EIGHT_PARTITION = 3
+} TOKEN_PARTITION;
+
+typedef enum
+{
+ RECON_CLAMP_REQUIRED = 0,
+ RECON_CLAMP_NOTREQUIRED = 1
+} CLAMP_TYPE;
+
+typedef struct VP8Common
+{
+ struct vpx_internal_error_info error;
+
+ DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][2]);
+ DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][2]);
+
+ int Width;
+ int Height;
+ int horiz_scale;
+ int vert_scale;
+
+ YUV_TYPE clr_type;
+ CLAMP_TYPE clamp_type;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+
+ YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
+ int fb_idx_ref_cnt[NUM_YV12_BUFFERS];
+ int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;
+
+ YV12_BUFFER_CONFIG temp_scale_frame;
+
+#if CONFIG_POSTPROC
+ YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+ int post_proc_buffer_int_used;
+ unsigned char *pp_limits_buffer; /* post-processing filter coefficients */
+#endif
+
+ FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+
+ int frame_flags;
+ int MBs;
+ int mb_rows;
+ int mb_cols;
+ int mode_info_stride;
+
+ /* profile settings */
+ int mb_no_coeff_skip;
+ int no_lpf;
+ int use_bilinear_mc_filter;
+ int full_pixel;
+
+ int base_qindex;
+
+ int y1dc_delta_q;
+ int y2dc_delta_q;
+ int y2ac_delta_q;
+ int uvdc_delta_q;
+ int uvac_delta_q;
+
+ unsigned int frames_since_golden;
+ unsigned int frames_till_alt_ref_frame;
+
+ /* We allocate a MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+
+ MODE_INFO *mip; /* Base of allocated array */
+ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+#if CONFIG_ERROR_CONCEALMENT
+ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
+ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+#endif
+
+ LOOPFILTERTYPE filter_type;
+
+ loop_filter_info_n lf_info;
+
+ int filter_level;
+ int last_sharpness_level;
+ int sharpness_level;
+
+ int refresh_last_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */
+ int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */
+
+ int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */
+ int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */
+
+ int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */
+
+ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
+
+ /* Y,U,V,Y2 */
+ ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */
+ ENTROPY_CONTEXT_PLANES left_context; /* (up to) 4 contexts "" */
+
+ FRAME_CONTEXT lfc; /* last frame entropy */
+ FRAME_CONTEXT fc; /* this frame entropy */
+
+ unsigned int current_video_frame;
+
+ int near_boffset[3];
+ int version;
+
+ TOKEN_PARTITION multi_token_partition;
+
+#ifdef PACKET_TESTING
+ VP8_HEADER oh;
+#endif
+ double bitrate;
+ double framerate;
+
+#if CONFIG_MULTITHREAD
+ int processor_core_count;
+#endif
+#if CONFIG_POSTPROC
+ struct postproc_state postproc_state;
+#endif
+ int cpu_caps;
+} VP8_COMMON;
+
+#endif
diff --git a/libvpx/vp8/common/onyxd.h b/libvpx/vp8/common/onyxd.h
new file mode 100644
index 0000000..fd7e051
--- /dev/null
+++ b/libvpx/vp8/common/onyxd.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8D_H
+#define __INC_VP8D_H
+
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "vpx_scale/yv12config.h"
+#include "ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
+
+ struct VP8D_COMP;
+
+ typedef struct
+ {
+ int Width;
+ int Height;
+ int Version;
+ int postprocess;
+ int max_threads;
+ int error_concealment;
+ int input_fragments;
+ } VP8D_CONFIG;
+
+ typedef enum
+ {
+ VP8D_OK = 0
+ } VP8D_SETTING;
+
+ void vp8dx_initialize(void);
+
+ void vp8dx_set_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst, int x);
+
+ int vp8dx_get_setting(struct VP8D_COMP* comp, VP8D_SETTING oxst);
+
+ int vp8dx_receive_compressed_data(struct VP8D_COMP* comp,
+ size_t size, const uint8_t *dest,
+ int64_t time_stamp);
+ int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
+
+ vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+ vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+
+ struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
+
+ void vp8dx_remove_decompressor(struct VP8D_COMP* comp);
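+
+    /* A minimal decode-loop sketch (illustrative; buf, size and the
+     * timestamps are placeholders, and error handling is omitted):
+     *
+     *     VP8D_CONFIG oxcf = { 0 };
+     *     struct VP8D_COMP *pbi = vp8dx_create_decompressor(&oxcf);
+     *
+     *     vp8dx_receive_compressed_data(pbi, size, buf, pts);
+     *     vp8dx_get_raw_frame(pbi, &sd, &ts, &ts_end, &ppflags);
+     *
+     *     vp8dx_remove_decompressor(pbi);
+     */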
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c
new file mode 100644
index 0000000..80fa530
--- /dev/null
+++ b/libvpx/vp8/common/postproc.c
@@ -0,0 +1,1206 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+#include "postproc.h"
+#include "common.h"
+#include "vpx_scale/vpxscale.h"
+#include "systemdependent.h"
+
+#include <limits.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define RGB_TO_YUV(t) \
+ ( (0.257*(float)(t>>16)) + (0.504*(float)(t>>8&0xff)) + (0.098*(float)(t&0xff)) + 16), \
+ (-(0.148*(float)(t>>16)) - (0.291*(float)(t>>8&0xff)) + (0.439*(float)(t&0xff)) + 128), \
+ ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
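+
+/* e.g. RGB_TO_YUV(0x98FB98) (PaleGreen: R=152, G=251, B=152) expands to
+ * roughly (196, 99, 92) before truncation to unsigned char.
+ */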
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */
+ { RGB_TO_YUV(0x00FF00) }, /* Green */
+ { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */
+ { RGB_TO_YUV(0x228B22) }, /* ForestGreen */
+ { RGB_TO_YUV(0x006400) }, /* DarkGreen */
+ { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */
+ { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */
+ { RGB_TO_YUV(0x00008B) }, /* Dark blue */
+ { RGB_TO_YUV(0x551A8B) }, /* Purple */
+ { RGB_TO_YUV(0xFF0000) } /* Red */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] =
+{
+ { RGB_TO_YUV(0x6633ff) }, /* Purple */
+ { RGB_TO_YUV(0xcc33ff) }, /* Magenta */
+ { RGB_TO_YUV(0xff33cc) }, /* Pink */
+ { RGB_TO_YUV(0xff3366) }, /* Coral */
+ { RGB_TO_YUV(0x3366ff) }, /* Blue */
+ { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */
+ { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */
+ { RGB_TO_YUV(0xff6633) }, /* Orange */
+ { RGB_TO_YUV(0x33ccff) }, /* Light Blue */
+ { RGB_TO_YUV(0x8ab800) }, /* Green */
+ { RGB_TO_YUV(0xffcc33) }, /* Light Orange */
+ { RGB_TO_YUV(0x33ffcc) }, /* Aqua */
+ { RGB_TO_YUV(0x66ff33) }, /* Light Green */
+ { RGB_TO_YUV(0xccff33) }, /* Yellow */
+};
+
+static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
+{
+    { RGB_TO_YUV(0x00ff00) }, /* Green */
+    { RGB_TO_YUV(0x0000ff) }, /* Blue */
+ { RGB_TO_YUV(0xffff00) }, /* Yellow */
+ { RGB_TO_YUV(0xff0000) }, /* Red */
+};
+#endif
+
+static const short kernel5[] =
+{
+ 1, 1, 4, 1, 1
+};
+
+const short vp8_rv[] =
+{
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
+ 0, 3, 9, 0, 0, 0, 8, 3, 14, 4,
+ 10, 1, 11, 14, 1, 14, 9, 6, 12, 11,
+ 8, 6, 10, 0, 0, 8, 9, 0, 3, 14,
+ 8, 11, 13, 4, 2, 9, 0, 3, 9, 6,
+ 1, 2, 3, 14, 13, 1, 8, 2, 9, 7,
+ 3, 3, 1, 13, 13, 6, 6, 5, 2, 7,
+ 11, 9, 11, 8, 7, 3, 2, 0, 13, 13,
+ 14, 4, 12, 5, 12, 10, 8, 10, 13, 10,
+ 4, 14, 4, 10, 0, 8, 11, 1, 13, 7,
+ 7, 14, 6, 14, 13, 2, 13, 5, 4, 4,
+ 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+ 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+ 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+ 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+ 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+ 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+ 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+ 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+ 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+ 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+ 0, 10, 0, 5, 13, 2, 12, 7, 11, 13,
+ 8, 0, 4, 10, 7, 2, 7, 2, 2, 5,
+ 3, 4, 7, 3, 3, 14, 14, 5, 9, 13,
+ 3, 14, 3, 6, 3, 0, 11, 8, 13, 1,
+ 13, 1, 12, 0, 10, 9, 7, 6, 2, 8,
+ 5, 2, 13, 7, 1, 13, 14, 7, 6, 7,
+ 9, 6, 10, 11, 7, 8, 7, 5, 14, 8,
+ 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6,
+ 11, 12, 12, 8, 0, 11, 13, 1, 2, 0,
+ 5, 10, 14, 7, 8, 0, 4, 11, 0, 8,
+ 0, 3, 10, 5, 8, 0, 11, 6, 7, 8,
+ 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14,
+ 3, 8, 3, 7, 8, 5, 11, 4, 12, 3,
+ 11, 9, 14, 8, 14, 13, 4, 3, 1, 2,
+ 14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
+ 5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
+ 0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
+};
+
+extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
+extern void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch);
+/***********************************************************************************************************
+ */
+void vp8_post_proc_down_and_across_mb_row_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+)
+{
+ unsigned char *p_src, *p_dst;
+ int row;
+ int col;
+ unsigned char v;
+ unsigned char d[4];
+
+ for (row = 0; row < size; row++)
+ {
+ /* post_proc_down for one row */
+ p_src = src_ptr;
+ p_dst = dst_ptr;
+
+ for (col = 0; col < cols; col++)
+ {
+ unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+ unsigned char p_above1 = p_src[col - src_pixels_per_line];
+ unsigned char p_below1 = p_src[col + src_pixels_per_line];
+ unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+
+ v = p_src[col];
+
+ if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col])
+ && (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col]))
+ {
+ unsigned char k1, k2, k3;
+ k1 = (p_above2 + p_above1 + 1) >> 1;
+ k2 = (p_below2 + p_below1 + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst_ptr;
+ p_dst = dst_ptr;
+
+ p_src[-2] = p_src[-1] = p_src[0];
+ p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+ for (col = 0; col < cols; col++)
+ {
+ v = p_src[col];
+
+ if ((abs(v - p_src[col - 2]) < f[col])
+ && (abs(v - p_src[col - 1]) < f[col])
+ && (abs(v - p_src[col + 1]) < f[col])
+ && (abs(v - p_src[col + 2]) < f[col]))
+ {
+ unsigned char k1, k2, k3;
+ k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+ k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ d[col & 3] = v;
+
+ if (col >= 2)
+ p_dst[col - 2] = d[(col - 2) & 3];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 3];
+ p_dst[col - 1] = d[(col - 1) & 3];
+
+ /* next row */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += dst_pixels_per_line;
+ }
+}
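+
+/* Note: up to rounding, the cascade of pairwise averages above applies the
+ * 5-tap kernel (1, 1, 4, 1, 1) / 8 declared as kernel5, and only where all
+ * four neighbor deltas are below the per-column threshold f[col].
+ */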
+
+static int q2mbl(int x)
+{
+ if (x < 20) x = 20;
+
+ x = 50 + (x - 50) * 10 / 8;
+ return x * x / 3;
+}
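+
+/* e.g. q2mbl(63): x = 50 + 13 * 10 / 8 = 66 in integer arithmetic, giving
+ * 66 * 66 / 3 = 1452, so the limit grows roughly quadratically with x.
+ */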
+void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
+{
+ int r, c, i;
+
+ unsigned char *s = src;
+ unsigned char d[16];
+
+ for (r = 0; r < rows; r++)
+ {
+ int sumsq = 0;
+ int sum = 0;
+
+        for (i = -8; i < 0; i++)
+            s[i] = s[0];
+
+        /* 17 avoids a valgrind warning - we buffer values in d
+         * and only write them back once we've read 8 ahead...
+         */
+        for (i = cols; i < cols + 17; i++)
+            s[i] = s[cols - 1];
+
+ for (i = -8; i <= 6; i++)
+ {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i+8] = 0;
+ }
+
+ for (c = 0; c < cols + 8; c++)
+ {
+ int x = s[c+7] - s[c-8];
+ int y = s[c+7] + s[c-8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c&15] = s[c];
+
+ if (sumsq * 15 - sum * sum < flimit)
+ {
+ d[c&15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c-8] = d[(c-8)&15];
+ }
+
+ s += pitch;
+ }
+}
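+
+/* Note: over the sliding 15-sample window, sumsq * 15 - sum * sum equals
+ * 225 times the window variance, so the averaging above is applied only
+ * where the local variance is below flimit / 225.
+ */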
+
+
+void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
+{
+ int r, c, i;
+ const short *rv3 = &vp8_rv[63&rand()];
+
+ for (c = 0; c < cols; c++ )
+ {
+ unsigned char *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ unsigned char d[16];
+ const short *rv2 = rv3 + ((c * 17) & 127);
+
+ for (i = -8; i < 0; i++)
+            s[i * pitch] = s[0];
+
+        /* 17 avoids a valgrind warning - we buffer values in d
+         * and only write them back once we've read 8 ahead...
+         */
+        for (i = rows; i < rows + 17; i++)
+            s[i * pitch] = s[(rows - 1) * pitch];
+
+ for (i = -8; i <= 6; i++)
+ {
+ sumsq += s[i*pitch] * s[i*pitch];
+ sum += s[i*pitch];
+ }
+
+ for (r = 0; r < rows + 8; r++)
+ {
+ sumsq += s[7*pitch] * s[ 7*pitch] - s[-8*pitch] * s[-8*pitch];
+ sum += s[7*pitch] - s[-8*pitch];
+ d[r&15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit)
+ {
+ d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
+ }
+
+ s[-8*pitch] = d[(r-8)&15];
+ s += pitch;
+ }
+ }
+}
+
+static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
+ int q)
+{
+ vp8_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+ vp8_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
+ post->y_width, q2mbl(q));
+}
+
+void vp8_deblock(VP8_COMMON *cm,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag)
+{
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mbr, mbc;
+
+    /* The pixel thresholds are adjusted according to whether or not the
+     * macroblock is a skipped block. */
+ unsigned char *ylimits = cm->pp_limits_buffer;
+ unsigned char *uvlimits = cm->pp_limits_buffer + 16 * cm->mb_cols;
+ (void) low_var_thresh;
+ (void) flag;
+
+ if (ppl > 0)
+ {
+ for (mbr = 0; mbr < cm->mb_rows; mbr++)
+ {
+ unsigned char *ylptr = ylimits;
+ unsigned char *uvlptr = uvlimits;
+ for (mbc = 0; mbc < cm->mb_cols; mbc++)
+ {
+ unsigned char mb_ppl;
+
+ if (mode_info_context->mbmi.mb_skip_coeff)
+ mb_ppl = (unsigned char)ppl >> 1;
+ else
+ mb_ppl = (unsigned char)ppl;
+
+ vpx_memset(ylptr, mb_ppl, 16);
+ vpx_memset(uvlptr, mb_ppl, 8);
+
+ ylptr += 16;
+ uvlptr += 8;
+ mode_info_context++;
+ }
+ mode_info_context++;
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->y_buffer + 16 * mbr * source->y_stride,
+ post->y_buffer + 16 * mbr * post->y_stride, source->y_stride,
+ post->y_stride, source->y_width, ylimits, 16);
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ post->u_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+ post->uv_stride, source->uv_width, uvlimits, 8);
+ vp8_post_proc_down_and_across_mb_row(
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ post->v_buffer + 8 * mbr * post->uv_stride, source->uv_stride,
+ post->uv_stride, source->uv_width, uvlimits, 8);
+ }
+ } else
+ {
+ vp8_yv12_copy_frame(source, post);
+ }
+}
+
+#if !(CONFIG_TEMPORAL_DENOISING)
+void vp8_de_noise(VP8_COMMON *cm,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag)
+{
+ double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+ int ppl = (int)(level + .5);
+ int mb_rows = source->y_width >> 4;
+ int mb_cols = source->y_height >> 4;
+    unsigned char *limits = cm->pp_limits_buffer;
+ int mbr, mbc;
+ (void) post;
+ (void) low_var_thresh;
+ (void) flag;
+
+ vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+
+    /* TODO: The original code doesn't filter the 2 outer rows and columns. */
+ for (mbr = 0; mbr < mb_rows; mbr++)
+ {
+ vp8_post_proc_down_and_across_mb_row(
+ source->y_buffer + 16 * mbr * source->y_stride,
+ source->y_buffer + 16 * mbr * source->y_stride,
+ source->y_stride, source->y_stride, source->y_width, limits, 16);
+
+ vp8_post_proc_down_and_across_mb_row(
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ source->u_buffer + 8 * mbr * source->uv_stride,
+ source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+ vp8_post_proc_down_and_across_mb_row(
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ source->v_buffer + 8 * mbr * source->uv_stride,
+ source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
+ }
+}
+#endif
+
+double vp8_gaussian(double sigma, double mu, double x)
+{
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+static void fillrd(struct postproc_state *state, int q, int a)
+{
+ char char_dist[300];
+
+ double sigma;
+ int ai = a, qi = q, i;
+
+ vp8_clear_system_state();
+
+
+ sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+ /* set up a lookup table of 256 entries that matches
+ * a gaussian distribution with sigma determined by q.
+ */
+ {
+ double i;
+ int next, j;
+
+ next = 0;
+
+ for (i = -32; i < 32; i++)
+ {
+ int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
+
+ if (a)
+ {
+ for (j = 0; j < a; j++)
+ {
+ char_dist[next+j] = (char) i;
+ }
+
+ next = next + j;
+ }
+
+ }
+
+ for (; next < 256; next++)
+ char_dist[next] = 0;
+
+ }
+
+ for (i = 0; i < 3072; i++)
+ {
+ state->noise[i] = char_dist[rand() & 0xff];
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ state->blackclamp[i] = -char_dist[0];
+ state->whiteclamp[i] = -char_dist[0];
+ state->bothclamp[i] = -2 * char_dist[0];
+ }
+
+ state->last_q = q;
+ state->last_noise = a;
+}
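+
+/* e.g. with a = 0 the table is built with sigma ranging from 0.5 (q = 63)
+ * up to 1.1 (q = 0); each additional noise level adds 1.0 to sigma.
+ */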
+
+/****************************************************************************
+ *
+ * ROUTINE : plane_add_noise_c
+ *
+ * INPUTS : unsigned char *Start starting address of buffer to add gaussian
+ * noise to
+ * unsigned int Width width of plane
+ * unsigned int Height height of plane
+ * int Pitch distance between subsequent lines of frame
+ * int q quantizer used to determine amount of noise
+ * to add
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void.
+ *
+ * FUNCTION : adds gaussian noise to a plane of pixels
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
+ char blackclamp[16],
+ char whiteclamp[16],
+ char bothclamp[16],
+ unsigned int Width, unsigned int Height, int Pitch)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < Height; i++)
+ {
+ unsigned char *Pos = Start + i * Pitch;
+ char *Ref = (char *)(noise + (rand() & 0xff));
+
+ for (j = 0; j < Width; j++)
+ {
+ if (Pos[j] < blackclamp[0])
+ Pos[j] = blackclamp[0];
+
+ if (Pos[j] > 255 + whiteclamp[0])
+ Pos[j] = 255 + whiteclamp[0];
+
+ Pos[j] += Ref[j];
+ }
+ }
+}
+
+/* Blend the macro block with a solid colored square. Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp8_blend_mb_inner_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ y += 2*stride + 2;
+ for (i = 0; i < 12; i++)
+ {
+ for (j = 0; j < 12; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ u += stride + 1;
+ v += stride + 1;
+
+ for (i = 0; i < 6; i++)
+ {
+ for (j = 0; j < 6; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+/* Blend only the edge of the macro block. Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp8_blend_mb_outer_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 16; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ for (i = 0; i < 12; i++)
+ {
+ y[0] = (y[0]*alpha + y1_const)>>16;
+ y[1] = (y[1]*alpha + y1_const)>>16;
+ y[14] = (y[14]*alpha + y1_const)>>16;
+ y[15] = (y[15]*alpha + y1_const)>>16;
+ y += stride;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 16; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (j = 0; j < 8; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+
+ for (i = 0; i < 6; i++)
+ {
+ u[0] = (u[0]*alpha + u1_const)>>16;
+ v[0] = (v[0]*alpha + v1_const)>>16;
+
+ u[7] = (u[7]*alpha + u1_const)>>16;
+ v[7] = (v[7]*alpha + v1_const)>>16;
+
+ u += stride;
+ v += stride;
+ }
+
+ for (j = 0; j < 8; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+}
+
+void vp8_blend_b_c (unsigned char *y, unsigned char *u, unsigned char *v,
+ int y1, int u1, int v1, int alpha, int stride)
+{
+ int i, j;
+ int y1_const = y1*((1<<16)-alpha);
+ int u1_const = u1*((1<<16)-alpha);
+ int v1_const = v1*((1<<16)-alpha);
+
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ y[j] = (y[j]*alpha + y1_const)>>16;
+ }
+ y += stride;
+ }
+
+ stride >>= 1;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ u[j] = (u[j]*alpha + u1_const)>>16;
+ v[j] = (v[j]*alpha + v1_const)>>16;
+ }
+ u += stride;
+ v += stride;
+ }
+}
+
+static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int height)
+{
+ int dx;
+ int dy;
+
+ if (*x1 > width)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = width;
+ if (dx)
+ *y1 = ((width-x0)*dy)/dx + y0;
+ }
+ if (*x1 < 0)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *x1 = 0;
+ if (dx)
+ *y1 = ((0-x0)*dy)/dx + y0;
+ }
+ if (*y1 > height)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = height;
+ if (dy)
+ *x1 = ((height-y0)*dx)/dy + x0;
+ }
+ if (*y1 < 0)
+ {
+ dx = *x1 - x0;
+ dy = *y1 - y0;
+
+ *y1 = 0;
+ if (dy)
+ *x1 = ((0-y0)*dx)/dy + x0;
+ }
+}
+
+#if CONFIG_POSTPROC
+int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
+{
+ int q = oci->filter_level * 10 / 6;
+ int flags = ppflags->post_proc_flag;
+ int deblock_level = ppflags->deblocking_level;
+ int noise_level = ppflags->noise_level;
+
+ if (!oci->frame_to_show)
+ return -1;
+
+ if (q > 63)
+ q = 63;
+
+ if (!flags)
+ {
+ *dest = *oci->frame_to_show;
+
+ /* handle problem with extending borders */
+ dest->y_width = oci->Width;
+ dest->y_height = oci->Height;
+ dest->uv_height = dest->y_height / 2;
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ oci->postproc_state.last_frame_valid = 1;
+ return 0;
+ }
+
+ /* Allocate post_proc_buffer_int if needed */
+ if ((flags & VP8D_MFQE) && !oci->post_proc_buffer_int_used)
+ {
+ if ((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK))
+ {
+ int width = (oci->Width + 15) & ~15;
+ int height = (oci->Height + 15) & ~15;
+
+ if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer_int,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&oci->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate MFQE framebuffer");
+
+ oci->post_proc_buffer_int_used = 1;
+
+            /* ensure that the postproc buffer is initialized so that post proc
+             * doesn't pull random data in from the edge
+ */
+ vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+
+ }
+ }
+
+ vp8_clear_system_state();
+
+ if ((flags & VP8D_MFQE) &&
+ oci->postproc_state.last_frame_valid &&
+ oci->current_video_frame >= 2 &&
+ oci->postproc_state.last_base_qindex < 60 &&
+ oci->base_qindex - oci->postproc_state.last_base_qindex >= 20)
+ {
+ vp8_multiframe_quality_enhance(oci);
+ if (((flags & VP8D_DEBLOCK) || (flags & VP8D_DEMACROBLOCK)) &&
+ oci->post_proc_buffer_int_used)
+ {
+ vp8_yv12_copy_frame(&oci->post_proc_buffer, &oci->post_proc_buffer_int);
+ if (flags & VP8D_DEMACROBLOCK)
+ {
+ vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10, 1, 0);
+ vp8_de_mblock(&oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10);
+ }
+ else if (flags & VP8D_DEBLOCK)
+ {
+ vp8_deblock(oci, &oci->post_proc_buffer_int, &oci->post_proc_buffer,
+ q, 1, 0);
+ }
+ }
+ /* Move partially towards the base q of the previous frame */
+ oci->postproc_state.last_base_qindex = (3*oci->postproc_state.last_base_qindex + oci->base_qindex)>>2;
+ }
+ else if (flags & VP8D_DEMACROBLOCK)
+ {
+ vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+ q + (deblock_level - 5) * 10, 1, 0);
+ vp8_de_mblock(&oci->post_proc_buffer, q + (deblock_level - 5) * 10);
+
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ }
+ else if (flags & VP8D_DEBLOCK)
+ {
+ vp8_deblock(oci, oci->frame_to_show, &oci->post_proc_buffer,
+ q, 1, 0);
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ }
+ else
+ {
+ vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+ oci->postproc_state.last_base_qindex = oci->base_qindex;
+ }
+ oci->postproc_state.last_frame_valid = 1;
+
+ if (flags & VP8D_ADDNOISE)
+ {
+ if (oci->postproc_state.last_q != q
+ || oci->postproc_state.last_noise != noise_level)
+ {
+ fillrd(&oci->postproc_state, 63 - q, noise_level);
+ }
+
+ vp8_plane_add_noise
+ (oci->post_proc_buffer.y_buffer,
+ oci->postproc_state.noise,
+ oci->postproc_state.blackclamp,
+ oci->postproc_state.whiteclamp,
+ oci->postproc_state.bothclamp,
+ oci->post_proc_buffer.y_width, oci->post_proc_buffer.y_height,
+ oci->post_proc_buffer.y_stride);
+ }
+
+#if CONFIG_POSTPROC_VISUALIZER
+ if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
+ {
+ char message[512];
+ sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+ (oci->frame_type == KEY_FRAME),
+ oci->refresh_golden_frame,
+ oci->base_qindex,
+ oci->filter_level,
+ flags,
+ oci->mb_cols, oci->mb_rows);
+ vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+ }
+
+ if (flags & VP8D_DEBUG_TXT_MBLK_MODES)
+ {
+ int i, j;
+ unsigned char *y_ptr;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+ int mb_index = 0;
+ MODE_INFO *mi = oci->mi;
+
+ y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+ /* vp8_filter each macro block */
+ for (i = 0; i < mb_rows; i++)
+ {
+ for (j = 0; j < mb_cols; j++)
+ {
+ char zz[4];
+
+ sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+ vp8_blit_text(zz, y_ptr, post->y_stride);
+ mb_index ++;
+ y_ptr += 16;
+ }
+
+ mb_index ++; /* border */
+ y_ptr += post->y_stride * 16 - post->y_width;
+
+ }
+ }
+
+ if (flags & VP8D_DEBUG_TXT_DC_DIFF)
+ {
+ int i, j;
+ unsigned char *y_ptr;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int mb_rows = post->y_height >> 4;
+ int mb_cols = post->y_width >> 4;
+ int mb_index = 0;
+ MODE_INFO *mi = oci->mi;
+
+ y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+ /* vp8_filter each macro block */
+ for (i = 0; i < mb_rows; i++)
+ {
+ for (j = 0; j < mb_cols; j++)
+ {
+ char zz[4];
+ int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+ mi[mb_index].mbmi.mode != SPLITMV &&
+ mi[mb_index].mbmi.mb_skip_coeff);
+
+ if (oci->frame_type == KEY_FRAME)
+ sprintf(zz, "a");
+ else
+ sprintf(zz, "%c", dc_diff + '0');
+
+ vp8_blit_text(zz, y_ptr, post->y_stride);
+ mb_index ++;
+ y_ptr += 16;
+ }
+
+ mb_index ++; /* border */
+ y_ptr += post->y_stride * 16 - post->y_width;
+
+ }
+ }
+
+ if (flags & VP8D_DEBUG_TXT_RATE_INFO)
+ {
+ char message[512];
+ sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
+ vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
+ }
+
+ /* Draw motion vectors */
+ if ((flags & VP8D_DEBUG_DRAW_MV) && ppflags->display_mv_flag)
+ {
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+ int x0, y0;
+
+ for (y0 = 0; y0 < height; y0 += 16)
+ {
+ for (x0 = 0; x0 < width; x0 += 16)
+ {
+ int x1, y1;
+
+ if (!(ppflags->display_mv_flag & (1<<mi->mbmi.mode)))
+ {
+ mi++;
+ continue;
+ }
+
+ if (mi->mbmi.mode == SPLITMV)
+ {
+ switch (mi->mbmi.partitioning)
+ {
+ case 0 : /* mv_top_bottom */
+ {
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+8, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+8, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 8 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+8, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+8, x1, y0+12, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case 1 : /* mv_left_right */
+ {
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+8, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+8, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 + 8 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+8, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+8, y1, y_buffer, y_stride);
+
+ break;
+ }
+ case 2 : /* mv_quarters */
+ {
+ union b_mode_info *bmi = &mi->bmi[0];
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[2];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 + 4 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+4, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+4, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[8];
+
+ x1 = x0 + 4 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+4, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+4, x1, y0+12, y1, y_buffer, y_stride);
+
+ bmi = &mi->bmi[10];
+
+ x1 = x0 +12 + (mv->col >> 3);
+ y1 = y0 +12 + (mv->row >> 3);
+
+ constrain_line (x0+12, &x1, y0+12, &y1, width, height);
+ vp8_blit_line (x0+12, x1, y0+12, y1, y_buffer, y_stride);
+ break;
+ }
+ default :
+ {
+ union b_mode_info *bmi = mi->bmi;
+ int bx0, by0;
+
+ for (by0 = y0; by0 < (y0+16); by0 += 4)
+ {
+ for (bx0 = x0; bx0 < (x0+16); bx0 += 4)
+ {
+ MV *mv = &bmi->mv.as_mv;
+
+ x1 = bx0 + 2 + (mv->col >> 3);
+ y1 = by0 + 2 + (mv->row >> 3);
+
+ constrain_line (bx0+2, &x1, by0+2, &y1, width, height);
+ vp8_blit_line (bx0+2, x1, by0+2, y1, y_buffer, y_stride);
+
+ bmi++;
+ }
+ }
+ }
+ }
+ }
+ else if (mi->mbmi.mode >= NEARESTMV)
+ {
+ MV *mv = &mi->mbmi.mv.as_mv;
+ const int lx0 = x0 + 8;
+ const int ly0 = y0 + 8;
+
+ x1 = lx0 + (mv->col >> 3);
+ y1 = ly0 + (mv->row >> 3);
+
+ if (x1 != lx0 && y1 != ly0)
+ {
+ constrain_line (lx0, &x1, ly0-1, &y1, width, height);
+ vp8_blit_line (lx0, x1, ly0-1, y1, y_buffer, y_stride);
+
+ constrain_line (lx0, &x1, ly0+1, &y1, width, height);
+ vp8_blit_line (lx0, x1, ly0+1, y1, y_buffer, y_stride);
+ }
+ else
+ vp8_blit_line (lx0, x1, ly0, y1, y_buffer, y_stride);
+ }
+
+ mi++;
+ }
+ mi++;
+ }
+ }
+
+ /* Color in block modes */
+ if ((flags & VP8D_DEBUG_CLR_BLK_MODES)
+ && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag))
+ {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16)
+ {
+ for (x = 0; x < width; x += 16)
+ {
+ int Y = 0, U = 0, V = 0;
+
+ if (mi->mbmi.mode == B_PRED &&
+ ((ppflags->display_mb_modes_flag & B_PRED) || ppflags->display_b_modes_flag))
+ {
+ int by, bx;
+ unsigned char *yl, *ul, *vl;
+ union b_mode_info *bmi = mi->bmi;
+
+ yl = y_ptr + x;
+ ul = u_ptr + (x>>1);
+ vl = v_ptr + (x>>1);
+
+ for (by = 0; by < 16; by += 4)
+ {
+ for (bx = 0; bx < 16; bx += 4)
+ {
+ if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
+ || (ppflags->display_mb_modes_flag & B_PRED))
+ {
+ Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
+ U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
+ V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
+
+ vp8_blend_b
+ (yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);
+ }
+ bmi++;
+ }
+
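+ /* advance 4 luma rows = 2 chroma rows; uv_stride is y_stride/2
+ * for these buffers, hence the y_stride*1 chroma step below. */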
+ yl += y_stride*4;
+ ul += y_stride*1;
+ vl += y_stride*1;
+ }
+ }
+ else if (ppflags->display_mb_modes_flag & (1<<mi->mbmi.mode))
+ {
+ Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0];
+ U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1];
+ V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2];
+
+ vp8_blend_mb_inner
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ }
+
+ mi++;
+ }
+ y_ptr += y_stride*16;
+ u_ptr += y_stride*4;
+ v_ptr += y_stride*4;
+
+ mi++;
+ }
+ }
+
+ /* Color in frame reference blocks */
+ if ((flags & VP8D_DEBUG_CLR_FRM_REF_BLKS) && ppflags->display_ref_frame_flag)
+ {
+ int y, x;
+ YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+ int width = post->y_width;
+ int height = post->y_height;
+ unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
+ unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
+ unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+ int y_stride = oci->post_proc_buffer.y_stride;
+ MODE_INFO *mi = oci->mi;
+
+ for (y = 0; y < height; y += 16)
+ {
+ for (x = 0; x < width; x += 16)
+ {
+ int Y = 0, U = 0, V = 0;
+
+ if (ppflags->display_ref_frame_flag & (1<<mi->mbmi.ref_frame))
+ {
+ Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0];
+ U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1];
+ V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2];
+
+ vp8_blend_mb_outer
+ (y_ptr+x, u_ptr+(x>>1), v_ptr+(x>>1), Y, U, V, 0xc000, y_stride);
+ }
+
+ mi++;
+ }
+ y_ptr += y_stride*16;
+ u_ptr += y_stride*4;
+ v_ptr += y_stride*4;
+
+ mi++;
+ }
+ }
+#endif
+
+ *dest = oci->post_proc_buffer;
+
+ /* handle problem with extending borders */
+ dest->y_width = oci->Width;
+ dest->y_height = oci->Height;
+ dest->uv_height = dest->y_height / 2;
+ return 0;
+}
+#endif
diff --git a/libvpx/vp8/common/postproc.h b/libvpx/vp8/common/postproc.h
new file mode 100644
index 0000000..495a2c9
--- /dev/null
+++ b/libvpx/vp8/common/postproc.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef POSTPROC_H
+#define POSTPROC_H
+
+#include "vpx_ports/mem.h"
+struct postproc_state
+{
+ int last_q;
+ int last_noise;
+ char noise[3072];
+ int last_base_qindex;
+ int last_frame_valid;
+ DECLARE_ALIGNED(16, char, blackclamp[16]);
+ DECLARE_ALIGNED(16, char, whiteclamp[16]);
+ DECLARE_ALIGNED(16, char, bothclamp[16]);
+};
+#include "onyxc_int.h"
+#include "ppflags.h"
+int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
+ vp8_ppflags_t *flags);
+
+
+void vp8_de_noise(struct VP8Common *oci,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag);
+
+void vp8_deblock(struct VP8Common *oci,
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *post,
+ int q,
+ int low_var_thresh,
+ int flag);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
+#endif
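
A usage sketch (names assumed, not taken verbatim from this patch): a
decoder wrapper drives every overlay above through vp8_ppflags_t. The
field name post_proc_flag and the constant VP8D_DEBUG_DRAW_MV are
assumed from ppflags.h; display_mv_flag is the per-mode bitmask that
vp8_post_proc_frame tests against (1 << mi->mbmi.mode).

    #include <string.h>
    #include "postproc.h"

    static int show_mv_overlay(struct VP8Common *cm, YV12_BUFFER_CONFIG *out)
    {
        vp8_ppflags_t pp;

        memset(&pp, 0, sizeof(pp));
        pp.post_proc_flag  = VP8D_DEBUG_DRAW_MV;  /* assumed flag name */
        pp.display_mv_flag = ~0;                  /* draw MVs for all modes */

        return vp8_post_proc_frame(cm, out, &pp); /* 0 on success */
    }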
diff --git a/libvpx/vp8/common/ppc/copy_altivec.asm b/libvpx/vp8/common/ppc/copy_altivec.asm
new file mode 100644
index 0000000..a4ce915
--- /dev/null
+++ b/libvpx/vp8/common/ppc/copy_altivec.asm
@@ -0,0 +1,47 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl copy_mem16x16_ppc
+
+;# r3 unsigned char *src
+;# r4 int src_stride
+;# r5 unsigned char *dst
+;# r6 int dst_stride
+
+;# Make the assumption that input will not be aligned,
+;# but the output will be. So two reads and a perm
+;# for the input, but only one store for the output.
+copy_mem16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xe000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+cp_16x16_loop:
+ lvsl v0, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v1, v1, v2, v0
+
+ stvx v1, 0, r5
+
+ add r3, r3, r4 ;# increment source pointer
+ add r5, r5, r6 ;# increment destination pointer
+
+ bdnz cp_16x16_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
diff --git a/libvpx/vp8/common/ppc/filter_altivec.asm b/libvpx/vp8/common/ppc/filter_altivec.asm
new file mode 100644
index 0000000..4da2e94
--- /dev/null
+++ b/libvpx/vp8/common/ppc/filter_altivec.asm
@@ -0,0 +1,1013 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl sixtap_predict_ppc
+ .globl sixtap_predict8x4_ppc
+ .globl sixtap_predict8x8_ppc
+ .globl sixtap_predict16x16_ppc
+
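+;# load_c: materialize the 32-bit address of LABEL from its
+;# high-adjusted (@ha) and low (@l) halves, then load the vector at
+;# LABEL + OFF. R0 and R1 are clobbered as scratch.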
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_hfilter V0, V1
+ load_c \V0, HFilter, r5, r9, r10
+
+ addi r5, r5, 16
+ lvx \V1, r5, r10
+.endm
+
+;# Vertical filtering
+.macro Vprolog
+ load_c v0, VFilter, r6, r3, r10
+
+ vspltish v5, 8
+ vspltish v6, 3
+ vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v1, v0, 1
+ vspltb v2, v0, 2
+ vspltb v3, v0, 3
+ vspltb v4, v0, 4
+ vspltb v5, v0, 5
+ vspltb v0, v0, 0
+.endm
+
+.macro vpre_load
+ Vprolog
+ li r10, 16
+ lvx v10, 0, r9 ;# v10..v14 = first 5 rows
+ lvx v11, r10, r9
+ addi r9, r9, 32
+ lvx v12, 0, r9
+ lvx v13, r10, r9
+ addi r9, r9, 32
+ lvx v14, 0, r9
+.endm
+
+.macro Msum Re, Ro, V, T, TMP
+ ;# (Re,Ro) += (V*T)
+ vmuleub \TMP, \V, \T ;# trashes v8
+ vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
+ vmuloub \TMP, \V, \T
+ vadduhm \Ro, \Ro, \TMP ;# Ro = odds
+.endm
+
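+;# vmuleub/vmuloub return 16-bit products of the even/odd source bytes,
+;# so Msum accumulates even and odd pixels in separate accumulators; the
+;# vmrghh/vmrglh at the end of the interpolation re-interleaves them.
+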
+.macro vinterp_no_store P0 P1 P2 P3 P4 P5
+ vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
+ vadduhm v16, v6, v8
+ vmuloub v8, \P0, v0
+ vadduhm v17, v6, v8
+ Msum v16, v17, \P2, v2, v8
+ Msum v16, v17, \P3, v3, v8
+ Msum v16, v17, \P5, v5, v8
+
+ vmuleub v18, \P1, v1 ;# 2 negative taps
+ vmuloub v19, \P1, v1
+ Msum v18, v19, \P4, v4, v8
+
+ vsubuhs v16, v16, v18 ;# subtract neg from pos
+ vsubuhs v17, v17, v19
+ vsrh v16, v16, v7 ;# divide by 128
+ vsrh v17, v17, v7 ;# v16 v17 = evens, odds
+ vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
+ vmrglh v19, v16, v17
+ vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
+.endm
+
+.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
+ vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
+ vadduhm v21, v20, v24
+ vmuloub v24, \P0, v13
+ vadduhm v22, v20, v24
+ Msum v21, v22, \P2, v15, v25
+ Msum v21, v22, \P3, v16, v25
+ Msum v21, v22, \P5, v18, v25
+
+ vmuleub v23, \P1, v14 ;# 2 negative taps
+ vmuloub v24, \P1, v14
+ Msum v23, v24, \P4, v17, v25
+
+ vsubuhs v21, v21, v23 ;# subtract neg from pos
+ vsubuhs v22, v22, v24
+ vsrh v21, v21, v19 ;# divide by 128
+ vsrh v22, v22, v19 ;# v21 v22 = evens, odds
+ vmrghh v23, v21, v22 ;# v23 v24 = 16-bit result in order
+ vmrglh v24, v21, v22
+ vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
+.endm
+
+
+.macro Vinterp P0 P1 P2 P3 P4 P5
+ vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
+ stvx \P0, 0, r7
+ add r7, r7, r8 ;# 33 ops per 16 pels
+.endm
+
+
+.macro luma_v P0, P1, P2, P3, P4, P5
+ addi r9, r9, 16 ;# P5 = newest input row
+ lvx \P5, 0, r9
+ Vinterp \P0, \P1, \P2, \P3, \P4, \P5
+.endm
+
+.macro luma_vtwo
+ luma_v v10, v11, v12, v13, v14, v15
+ luma_v v11, v12, v13, v14, v15, v10
+.endm
+
+.macro luma_vfour
+ luma_vtwo
+ luma_v v12, v13, v14, v15, v10, v11
+ luma_v v13, v14, v15, v10, v11, v12
+.endm
+
+.macro luma_vsix
+ luma_vfour
+ luma_v v14, v15, v10, v11, v12, v13
+ luma_v v15, v10, v11, v12, v13, v14
+.endm
+
+.macro Interp4 R I I4
+ vmsummbm \R, v13, \I, v15
+ vmsummbm \R, v14, \I4, \R
+.endm
+
+.macro Read8x8 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx \VD, 0, \RS
+ lvx v20, r10, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, \VD, v20, v21
+.endm
+
+.macro interp_8x8 R
+ vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
+ vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
+ Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
+ vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
+ Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
+
+ vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
+ vsrh \R, \R, v19
+
+ vpkuhus \R, \R, \R ;# saturate and pack
+
+.endm
+
+.macro Read4x4 VD, RS, RP, increment_counter
+ lvsl v21, 0, \RS ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v20, 0, \RS
+
+.if \increment_counter
+ add \RS, \RS, \RP
+.endif
+
+ vperm \VD, v20, v20, v21
+.endm
+ .text
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+sixtap_predict_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xff87
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- vertical_only_4x4
+
+ ;# load up horizontal filter
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_4x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_4x4
+
+vertical_only_4x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_4x4:
+ load_c v20, b_hilo_4x4, 0, r9, r10
+ load_c v21, b_hilo, 0, r9, r10
+
+ ;# reposition input so that it can go through the
+ ;# filtering phase with one pass.
+ vperm v0, v0, v1, v20 ;# 0 1 x x
+ vperm v2, v2, v3, v20 ;# 2 3 x x
+ vperm v4, v4, v5, v20 ;# 4 5 x x
+ vperm v6, v6, v7, v20 ;# 6 7 x x
+
+ vperm v0, v0, v2, v21 ;# 0 1 2 3
+ vperm v4, v4, v6, v21 ;# 4 5 6 7
+
+ vsldoi v1, v0, v4, 4
+ vsldoi v2, v0, v4, 8
+ vsldoi v3, v0, v4, 12
+
+ vsldoi v5, v4, v8, 4
+
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+
+ stvx v0, 0, r1
+
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 4(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 8(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ lwz r0, 12(r1)
+ stw r0, 0(r7)
+
+ b exit_4x4
+
+store_4x4:
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v4, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v5, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+sixtap_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x4
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x4
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+
+ b second_pass_8x4
+
+second_pass_pre_copy_8x4:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x4:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+ b exit_8x4
+
+store_8x4:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x4
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned2_8x4:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Because the width to be filtered fits in a single AltiVec register,
+;# there is no need to loop; everything can stay in registers.
+sixtap_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq- second_pass_pre_copy_8x8
+
+ load_hfilter v13, v14
+
+ ;# rounding added in on the multiply
+ vspltisw v16, 8
+ vspltisw v15, 3
+ vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
+
+ ;# Load up permutation constants
+ load_c v16, B_0123, 0, r9, r10
+ load_c v17, B_4567, 0, r9, r10
+ load_c v18, B_89AB, 0, r9, r10
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ addi r9, r3, 0
+ li r10, 16
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# filter a line
+ interp_8x8 v2
+ interp_8x8 v3
+ interp_8x8 v4
+ interp_8x8 v5
+ interp_8x8 v6
+ interp_8x8 v7
+ interp_8x8 v8
+ interp_8x8 v9
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional 5 lines that are needed
+ ;# for the vertical filter.
+ beq- store_8x8
+
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r9, r9, r4
+ sub r9, r9, r4
+
+ Read8x8 v0, r9, r4, 1
+ Read8x8 v1, r9, r4, 0
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ interp_8x8 v0
+ interp_8x8 v1
+ interp_8x8 v10
+ interp_8x8 v11
+ interp_8x8 v12
+
+ b second_pass_8x8
+
+second_pass_pre_copy_8x8:
+ ;# only needed if there is a vertical filter present
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+ li r10, 16
+
+ Read8x8 v0, r3, r4, 1
+ Read8x8 v1, r3, r4, 1
+ Read8x8 v2, r3, r4, 1
+ Read8x8 v3, r3, r4, 1
+ Read8x8 v4, r3, r4, 1
+ Read8x8 v5, r3, r4, 1
+ Read8x8 v6, r3, r4, 1
+ Read8x8 v7, r3, r4, 1
+ Read8x8 v8, r3, r4, 1
+ Read8x8 v9, r3, r4, 1
+ Read8x8 v10, r3, r4, 1
+ Read8x8 v11, r3, r4, 1
+ Read8x8 v12, r3, r4, 0
+
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+second_pass_8x8:
+ load_c v13, VFilter, r6, r9, r10
+
+ vspltish v15, 8
+ vspltish v20, 3
+ vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ vspltb v14, v13, 1
+ vspltb v15, v13, 2
+ vspltb v16, v13, 3
+ vspltb v17, v13, 4
+ vspltb v18, v13, 5
+ vspltb v13, v13, 0
+
+ vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
+ vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
+ vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
+ vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
+ vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
+ vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
+ vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
+ vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8:
+
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+ b exit_8x8
+
+store_8x8:
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned2_8x8
+
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+ w_8x8 v8, r7, r0, r8
+ w_8x8 v9, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned2_8x8:
+ load_c v10, b_hilo, 0, r9, r10
+
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+ vperm v8, v8, v9, v10
+
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+ addi r7, r7, 16
+ stvx v8, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+
+;# Two-pass filtering: the first pass filters horizontally, the second
+;# vertically. Either filter can be null, but not both. A temporary
+;# buffer is needed because the source buffer can't be modified and the
+;# destination buffer is too small to hold the intermediate data.
+sixtap_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ ;# Three possibilities
+ ;# 1. First filter is null. Don't use a temp buffer.
+ ;# 2. Second filter is null. Don't use a temp buffer.
+ ;# 3. Neither is null; use a temp buffer.
+
+ ;# First pass (horizontal edge)
+ ;# Set up pointers for src.
+ ;# If possibility (1) holds, point src at the original buffer and jump
+ ;# to the second pass; this is decided by whether x_offset is 0.
+
+ ;# load up horizontal filter
+ slwi. r5, r5, 5 ;# index into horizontal filter array
+
+ load_hfilter v4, v5
+
+ beq- copy_horizontal_16x21
+
+ ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
+ addi r3, r3, -2
+
+ slwi. r6, r6, 4 ;# index into vertical filter array
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v14, b_hperm, 0, r9, r10
+
+ ;# These statements assume there won't be a second pass; when there is
+ ;# one, the fall-through code below replaces these values.
+ li r0, 16 ;# prepare for no vertical filter
+
+ ;# Change the output pointer and pitch to be the actual
+ ;# destination instead of a temporary buffer.
+ addi r9, r7, 0
+ addi r5, r8, 0
+
+ ;# no vertical filter, so write the output from the first pass
+ ;# directly into the output buffer.
+ beq- no_vertical_filter_bypass
+
+ ;# if the second filter is not null then need to back off by 2*pitch
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# setup counter for the number of lines that are going to be filtered
+ li r0, 21
+
+ ;# use the stack as temporary storage
+ la r9, 48(r1)
+ li r5, 16
+
+no_vertical_filter_bypass:
+
+ mtctr r0
+
+ ;# rounding added in on the multiply
+ vspltisw v10, 8
+ vspltisw v12, 3
+ vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v13, 7
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+horizontal_loop_16x16:
+
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+ lvx v3, r12, r3
+
+ vperm v8, v1, v2, v15
+ vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
+
+ vsldoi v11, v8, v9, 4
+
+ ;# set 0
+ vmsummbm v6, v4, v8, v12 ;# taps times elements
+ vmsummbm v0, v5, v11, v6
+
+ ;# set 1
+ vsldoi v10, v8, v9, 1
+ vsldoi v11, v8, v9, 5
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v1, v5, v11, v6
+
+ ;# set 2
+ vsldoi v10, v8, v9, 2
+ vsldoi v11, v8, v9, 6
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v2, v5, v11, v6
+
+ ;# set 3
+ vsldoi v10, v8, v9, 3
+ vsldoi v11, v8, v9, 7
+
+ vmsummbm v6, v4, v10, v12
+ vmsummbm v3, v5, v11, v6
+
+ vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
+
+ vsrh v0, v0, v13 ;# divide v0, v1 by 128
+ vsrh v1, v1, v13
+
+ vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
+ vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
+
+ stvx v0, 0, r9
+ add r9, r9, r5
+
+ add r3, r3, r4
+
+ bdnz horizontal_loop_16x16
+
+ ;# check again to see if vertical filter needs to be done.
+ cmpi cr0, r6, 0
+ beq cr0, end_16x16
+
+ ;# yes there is, so go to the second pass
+ b second_pass_16x16
+
+copy_horizontal_16x21:
+ li r10, 21
+ mtctr r10
+
+ li r10, 16
+
+ sub r3, r3, r4
+ sub r3, r3, r4
+
+ ;# this is done above if there is a horizontal filter,
+ ;# if not it needs to be done down here.
+ slwi r6, r6, 4 ;# index into vertical filter array
+
+ ;# always write to the stack when doing a horizontal copy
+ la r9, 48(r1)
+
+copy_horizontal_loop_16x21:
+ lvsl v15, 0, r3 ;# permutate value for alignment
+
+ lvx v1, 0, r3
+ lvx v2, r10, r3
+
+ vperm v8, v1, v2, v15
+
+ stvx v8, 0, r9
+ addi r9, r9, 16
+
+ add r3, r3, r4
+
+ bdnz copy_horizontal_loop_16x21
+
+second_pass_16x16:
+
+ ;# always read from the stack when doing a vertical filter
+ la r9, 48(r1)
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v7, 7
+
+ vpre_load
+
+ luma_vsix
+ luma_vsix
+ luma_vfour
+
+end_16x16:
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+HFilter:
+ .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
+ .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
+ .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
+ .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
+ .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+ .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
+ .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
+ .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
+ .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
+ .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
+ .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
+ .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
+ .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
+
+ .align 4
+VFilter:
+ .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+ .align 4
+b_hperm:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+B_0123:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+B_4567:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+ .align 4
+B_89AB:
+ .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+
+ .align 4
+b_hilo:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+b_hilo_4x4:
+ .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
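
For reference, the tap tables above encode the usual VP8 six-tap FIR:
each row's six taps sum to 128, the splatted 0x0040 adds a rounding
bias of 64, and the vsrh by 7 divides by 128 before the saturating
pack. A scalar sketch of one output pixel (hypothetical helper; the
example taps 3, -16, 77, 77, -16, 3 are the half-pel row of HFilter):

    #include <stdint.h>

    /* src points at the output pixel position; the window is the same
     * "2 before and 3 after" the assembly backs the input pointer off for. */
    static uint8_t sixtap_pixel(const uint8_t *src, const int taps[6])
    {
        int i, sum = 0;

        for (i = 0; i < 6; i++)
            sum += taps[i] * src[i - 2];

        sum = (sum + 64) >> 7;        /* round, then divide by 128 */
        if (sum < 0)   sum = 0;       /* saturate, as vpkuhus does */
        if (sum > 255) sum = 255;
        return (uint8_t)sum;
    }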
diff --git a/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm b/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
new file mode 100644
index 0000000..fd8aa66
--- /dev/null
+++ b/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
@@ -0,0 +1,677 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl bilinear_predict4x4_ppc
+ .globl bilinear_predict8x4_ppc
+ .globl bilinear_predict8x8_ppc
+ .globl bilinear_predict16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r9, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+ li r12, 32
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r9, r0
+
+ ;# setup constants
+ ;# v14 permutation value for alignment
+ load_c v28, b_hperm_b, 0, r9, r0
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro HFilter V
+ vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
+ vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v24 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
+.macro hfilter_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ HFilter \V
+.endm
+
+
+.macro load_and_align_8 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_aligned_8 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v22 v23 = evens, odds
+ vmrghh \P0, v22, v23 ;# \P0 v23 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
+
+.macro w_8x8 V, D, R, P
+ stvx \V, 0, r1
+ lwz \R, 0(r1)
+ stw \R, 0(r7)
+ lwz \R, 4(r1)
+ stw \R, 4(r7)
+ add \D, \D, \P
+.endm
+
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_4x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_4x4_b:
+
+ stvx v0, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v1, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v2, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+ add r7, r7, r8
+
+ stvx v3, 0, r1
+ lwz r0, 0(r1)
+ stw r0, 0(r7)
+
+exit_4x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x4_b
+
+ hfilter_8 v4, 0
+
+ b second_pass_8x4_b
+
+second_pass_8x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+
+second_pass_8x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+store_out_8x4_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x4_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+
+ b exit_8x4
+
+store_aligned_8x4_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+
+exit_8x4:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r9, r12
+ load_c v11, b_4567_b, 0, r9, r12
+
+ hfilter_8 v0, 1
+ hfilter_8 v1, 1
+ hfilter_8 v2, 1
+ hfilter_8 v3, 1
+ hfilter_8 v4, 1
+ hfilter_8 v5, 1
+ hfilter_8 v6, 1
+ hfilter_8 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_8x8_b
+
+ hfilter_8 v8, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_8 v0, 1
+ load_and_align_8 v1, 1
+ load_and_align_8 v2, 1
+ load_and_align_8 v3, 1
+ load_and_align_8 v4, 1
+ load_and_align_8 v5, 1
+ load_and_align_8 v6, 1
+ load_and_align_8 v7, 1
+ load_and_align_8 v8, 0
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+store_out_8x8_b:
+
+ cmpi cr0, r8, 8
+ beq cr0, store_aligned_8x8_b
+
+ w_8x8 v0, r7, r0, r8
+ w_8x8 v1, r7, r0, r8
+ w_8x8 v2, r7, r0, r8
+ w_8x8 v3, r7, r0, r8
+ w_8x8 v4, r7, r0, r8
+ w_8x8 v5, r7, r0, r8
+ w_8x8 v6, r7, r0, r8
+ w_8x8 v7, r7, r0, r8
+
+ b exit_8x8
+
+store_aligned_8x8_b:
+ load_c v10, b_hilo_b, 0, r9, r10
+
+ vperm v0, v0, v1, v10
+ vperm v2, v2, v3, v10
+ vperm v4, v4, v5, v10
+ vperm v6, v6, v7, v10
+
+ stvx v0, 0, r7
+ addi r7, r7, 16
+ stvx v2, 0, r7
+ addi r7, r7, 16
+ stvx v4, 0, r7
+ addi r7, r7, 16
+ stvx v6, 0, r7
+
+exit_8x8:
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+ vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+ vsrh v24, v24, v19 ;# divide v24, v25 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
+
+.macro load_and_align_16 V, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+.macro write_16 V, increment_counter
+ stvx \V, 0, r7
+
+.if \increment_counter
+ add r7, r7, r8
+.endif
+.endm
+
+ .align 2
+;# r3 unsigned char * src
+;# r4 int src_pitch
+;# r5 int x_offset
+;# r6 int y_offset
+;# r7 unsigned char * dst
+;# r8 int dst_pitch
+bilinear_predict16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq store_out_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, 1
+ load_and_align_16 v1, 1
+ load_and_align_16 v2, 1
+ load_and_align_16 v3, 1
+ load_and_align_16 v4, 1
+ load_and_align_16 v5, 1
+ load_and_align_16 v6, 1
+ load_and_align_16 v7, 1
+ load_and_align_16 v8, 1
+ load_and_align_16 v9, 1
+ load_and_align_16 v10, 1
+ load_and_align_16 v11, 1
+ load_and_align_16 v12, 1
+ load_and_align_16 v13, 1
+ load_and_align_16 v14, 1
+ load_and_align_16 v15, 1
+ load_and_align_16 v16, 0
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+store_out_16x16_b:
+
+ write_16 v0, 1
+ write_16 v1, 1
+ write_16 v2, 1
+ write_16 v3, 1
+ write_16 v4, 1
+ write_16 v5, 1
+ write_16 v6, 1
+ write_16 v7, 1
+ write_16 v8, 1
+ write_16 v9, 1
+ write_16 v10, 1
+ write_16 v11, 1
+ write_16 v12, 1
+ write_16 v13, 1
+ write_16 v14, 1
+ write_16 v15, 0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
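
The bilinear path is the same fixed-point pattern with only two taps:
each hfilter_b/vfilter_b row holds a pair summing to 128 (e.g. 112,16
for offset 1), applied first horizontally and then vertically. A scalar
sketch (hypothetical helper):

    #include <stdint.h>

    /* a and b are adjacent pixels; t0 + t1 == 128, as in the tables above. */
    static uint8_t bilinear_pixel(uint8_t a, uint8_t b, int t0, int t1)
    {
        return (uint8_t)((a * t0 + b * t1 + 64) >> 7);  /* round, then >>7 */
    }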
diff --git a/libvpx/vp8/common/ppc/idctllm_altivec.asm b/libvpx/vp8/common/ppc/idctllm_altivec.asm
new file mode 100644
index 0000000..117d9cf
--- /dev/null
+++ b/libvpx/vp8/common/ppc/idctllm_altivec.asm
@@ -0,0 +1,189 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl short_idct4x4llm_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+ .align 2
+short_idct4x4llm_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ load_c v8, sinpi8sqrt2, 0, r9, r10
+ load_c v9, cospi8sqrt2minus1, 0, r9, r10
+ load_c v10, hi_hi, 0, r9, r10
+ load_c v11, lo_lo, 0, r9, r10
+ load_c v12, shift_16, 0, r9, r10
+
+ li r10, 16
+ lvx v0, 0, r3 ;# input ip[0], ip[ 4]
+ lvx v1, r10, r3 ;# input ip[8], ip[12]
+
+ ;# first pass
+ vupkhsh v2, v0
+ vupkhsh v3, v1
+ vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
+
+ vupklsh v0, v0
+ vmulosh v4, v0, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vupklsh v1, v1
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v3, v1, v8
+ vsraw v3, v3, v12
+ vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v0, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v0
+
+ vaddsws v3, v3, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ ;# transpose input
+ vmrghw v4, v0, v1 ;# a0 b0 a1 b1
+ vmrghw v5, v2, v3 ;# c0 d0 c1 d1
+
+ vmrglw v6, v0, v1 ;# a2 b2 a3 b3
+ vmrglw v7, v2, v3 ;# c2 d2 c3 d3
+
+ vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
+ vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
+
+ vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
+ vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
+
+ ;# second pass
+ vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
+ vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
+
+ vmulosh v4, v1, v8
+ vsraw v4, v4, v12
+ vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v3, v9
+ vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v3
+
+ vsubsws v4, v4, v5 ;# c1
+
+ vmulosh v2, v3, v8
+ vsraw v2, v2, v12
+ vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
+
+ vmulosh v5, v1, v9
+ vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
+ vaddsws v5, v5, v1
+
+ vaddsws v3, v2, v5 ;# d1
+
+ vaddsws v0, v6, v3 ;# a1 + d1
+ vsubsws v3, v6, v3 ;# a1 - d1
+
+ vaddsws v1, v7, v4 ;# b1 + c1
+ vsubsws v2, v7, v4 ;# b1 - c1
+
+ vspltish v6, 4
+ vspltish v7, 3
+
+ vpkswss v0, v0, v1
+ vpkswss v1, v2, v3
+
+ vaddshs v0, v0, v6
+ vaddshs v1, v1, v6
+
+ vsrah v0, v0, v7
+ vsrah v1, v1, v7
+
+ ;# transpose output
+ vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
+ vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
+
+ vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
+ vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
+
+ stwu r1,-416(r1) ;# create space on the stack
+
+ stvx v0, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ stvx v1, 0, r1
+ lwz r6, 0(r1)
+ stw r6, 0(r4)
+ lwz r6, 4(r1)
+ stw r6, 4(r4)
+
+ add r4, r4, r5
+
+ lwz r6, 8(r1)
+ stw r6, 0(r4)
+ lwz r6, 12(r1)
+ stw r6, 4(r4)
+
+ addi r1, r1, 416 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 4
+sinpi8sqrt2:
+ .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
+
+ .align 4
+cospi8sqrt2minus1:
+ .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
+
+ .align 4
+shift_16:
+ .long 16, 16, 16, 16
+
+ .align 4
+hi_hi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+
+ .align 4
+lo_lo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
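
The constants here are the fixed-point multipliers of the VP8 inverse
DCT: sinpi8sqrt2 = 35468 is sin(pi/8)*sqrt(2) in Q16, and
cospi8sqrt2minus1 = 20091 is (cos(pi/8)*sqrt(2) - 1) in Q16; each
vmulosh/vsraw pair is followed by a vaddsws that adds the input back
in. A scalar sketch of one column's butterfly, mirroring the reference
vp8_short_idct4x4llm:

    static void idct_col(const short *ip, int *a1, int *b1, int *c1, int *d1)
    {
        const int sinpi8sqrt2       = 35468;
        const int cospi8sqrt2minus1 = 20091;

        *a1 = ip[0] + ip[8];
        *b1 = ip[0] - ip[8];

        *c1 = ((ip[4] * sinpi8sqrt2) >> 16)
            - (ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16));

        *d1 = (ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16))
            + ((ip[12] * sinpi8sqrt2) >> 16);
    }

The outputs recombine as a1+d1, b1+c1, b1-c1, a1-d1; after the second
pass the result is rounded by (x + 4) >> 3, matching the vaddshs/vsrah
pair above.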
diff --git a/libvpx/vp8/common/ppc/loopfilter_altivec.c b/libvpx/vp8/common/ppc/loopfilter_altivec.c
new file mode 100644
index 0000000..71bf6e2
--- /dev/null
+++ b/libvpx/vp8/common/ppc/loopfilter_altivec.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "loopfilter.h"
+#include "onyxc_int.h"
+
+typedef void loop_filter_function_y_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_uv_ppc
+(
+ unsigned char *u, // source pointer
+ unsigned char *v, // source pointer
+ int p, // pitch
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh
+);
+
+typedef void loop_filter_function_s_ppc
+(
+ unsigned char *s, // source pointer
+ int p, // pitch
+ const signed char *flimit
+);
+
+loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
+loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
+
+loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
+loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
+
+loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
+loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
+
+// Horizontal MB filtering
+void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Vertical MB Filtering
+void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
+}
+
+// Horizontal B Filtering
+void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ // These should all be done at once with one call, instead of 3
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+ loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
+ loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
+}
+
+// Vertical B Filtering
+void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
+
+ if (u_ptr)
+ loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
+}
+
+void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ (void)u_ptr;
+ (void)v_ptr;
+ (void)uv_stride;
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
+ loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
+}
diff --git a/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm b/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
new file mode 100644
index 0000000..61df4e9
--- /dev/null
+++ b/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
@@ -0,0 +1,1253 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl mbloop_filter_horizontal_edge_y_ppc
+ .globl loop_filter_horizontal_edge_y_ppc
+ .globl mbloop_filter_vertical_edge_y_ppc
+ .globl loop_filter_vertical_edge_y_ppc
+
+ .globl mbloop_filter_horizontal_edge_uv_ppc
+ .globl loop_filter_horizontal_edge_uv_ppc
+ .globl mbloop_filter_vertical_edge_uv_ppc
+ .globl loop_filter_vertical_edge_uv_ppc
+
+ .globl loop_filter_simple_horizontal_edge_ppc
+ .globl loop_filter_simple_vertical_edge_ppc
+
+ .text
+;# We often need to perform transposes (and other transpose-like operations)
+;# on matrices of data. This is simplified by the fact that we usually
+;# operate on hunks of data whose dimensions are powers of 2, or at least
+;# divisible by highish powers of 2.
+;#
+;# These operations can be very confusing. They become more straightforward
+;# when we think of them as permutations of address bits: Concatenate a
+;# group of vector registers and think of it as occupying a block of
+;# memory beginning at address zero. The low four bits 0...3 of the
+;# address then correspond to position within a register, the higher-order
+;# address bits select the register.
+;#
+;# Although register selection, at the code level, is arbitrary, things
+;# are simpler if we use contiguous ranges of register numbers, simpler
+;# still if the low-order bits of the register number correspond to
+;# conceptual address bits. We do this whenever reasonable.
+;#
+;# A 16x16 transpose can then be thought of as an operation on
+;# a 256-element block of memory. It takes 8 bits 0...7 to address this
+;# memory and the effect of a transpose is to interchange address bit
+;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
+;# column, which is interchanged with the row addressed by bits 4..7.
+;#
+;# The altivec merge instructions provide a rapid means of effecting
+;# many of these transforms. They operate at three widths (8,16,32).
+;# Writing V(x) for vector register #x, paired merges permute address
+;# indices as follows.
+;#
+;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
+;#
+;# vmrghb V( x), V( y), V( y + (1<<s))
+;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
+;#
+;# vmrghh V( x), V( y), V( y + (1<<s))
+;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# =0= =1= 2->3 3->(4+d) (4+s)->2:
+;#
+;# vmrghw V( x), V( y), V( y + (1<<s))
+;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
+;#
+;#
+;# Unfortunately, there is no doubleword merge instruction.
+;# The following sequence uses "vperm" as a substitute.
+;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
+;# are in registers Vhihi and Vlolo, we can also effect the permutation
+;#
+;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
+;#
+;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
+;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
+;#
+;#
+;# Except for bits s and d, the other relationships between register
+;# number (= high-order part of address) bits are at the disposal of
+;# the programmer.
+;#
+
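+;# As a scalar cross-check of the address-bit view (illustrative only, not
+;# part of the build): a 16x16 byte transpose is exactly the exchange of the
+;# low four address bits with the high four.
+;#
+;#   /* b: 256 bytes viewed as a row-major 16x16 matrix */
+;#   static void transpose16x16_ref(unsigned char *b) {
+;#       for (int i = 0; i < 256; i++) {
+;#           int j = ((i & 0x0f) << 4) | (i >> 4); /* swap bits 0-3 and 4-7 */
+;#           if (i < j) {
+;#               unsigned char t = b[i]; b[i] = b[j]; b[j] = t;
+;#           }
+;#       }
+;#   }
+;#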
+;# To avoid excess transposes, we filter all 3 vertical luma subblock
+;# edges together. This requires a single 16x16 transpose, which, in
+;# the above language, amounts to the following permutation of address
+;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
+;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
+;#
+;# Except for the fact that the destination registers get written
+;# before we are done referencing the old contents, the cyclic transform
+;# is effected by
+;#
+;# x = 0; do {
+;# vmrghb V(2x), V(x), V(x+8);
+;#   vmrglb V(2x+1), V(x), V(x+8);
+;# } while( ++x < 8);
+;#
+;# For clarity, and because we can afford it, we do this transpose
+;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
+;# leaving the final result in 16 .. 31, as the lower registers are
+;# used in the filtering itself.
+;#
+.macro Tpair A, B, X, Y
+ vmrghb \A, \X, \Y
+ vmrglb \B, \X, \Y
+.endm
+
+;# Each step takes 8*2 = 16 instructions
+
+.macro t16_even
+ Tpair v16,v17, v0,v8
+ Tpair v18,v19, v1,v9
+ Tpair v20,v21, v2,v10
+ Tpair v22,v23, v3,v11
+ Tpair v24,v25, v4,v12
+ Tpair v26,v27, v5,v13
+ Tpair v28,v29, v6,v14
+ Tpair v30,v31, v7,v15
+.endm
+
+.macro t16_odd
+ Tpair v0,v1, v16,v24
+ Tpair v2,v3, v17,v25
+ Tpair v4,v5, v18,v26
+ Tpair v6,v7, v19,v27
+ Tpair v8,v9, v20,v28
+ Tpair v10,v11, v21,v29
+ Tpair v12,v13, v22,v30
+ Tpair v14,v15, v23,v31
+.endm
+
+;# Whole transpose takes 4*16 = 64 instructions
+
+.macro t16_full
+ t16_odd
+ t16_even
+ t16_odd
+ t16_even
+.endm
+
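+;# Sanity check for the cyclic transform (hypothetical helper, not in the
+;# build): one merge pass rotates the 8 address bits left by one, so four
+;# passes map bit i to bit (i+4) mod 8 -- the full transpose.
+;#
+;#   static int rot8(int i) { return ((i << 1) | (i >> 7)) & 0xff; }
+;#   /* rot8 applied four times swaps the nibbles of an index: 0x2b -> 0xb2 */
+;#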
+;# Vertical edge filtering requires transposes. For the simple filter,
+;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
+;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
+;#
+;# v0 = 0 1 ... 14 15
+;# v1 = 16 17 ... 30 31
+;# v2 = 32 33 ... 46 47
+;# v3 = 48 49 ... 62 63
+;#
+;# In frame-buffer memory, the layout is:
+;#
+;# 0 16 32 48
+;# 1 17 33 49
+;# ...
+;# 15 31 47 63.
+;#
+;# We begin by reading the data 32 bits at a time (using scalar operations)
+;# into a temporary array, reading the rows of the array into vector registers,
+;# with the following layout:
+;#
+;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
+;# v1 = 1 17 33 49 5 21 ... 45 61
+;# v2 = 2 18 ... 46 62
+;# v3 = 3 19 ... 47 63
+;#
+;# From the "address-bit" perspective discussed above, we simply need to
+;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
+;# In other words, we transpose each of the four 4x4 submatrices.
+;#
+;# This transformation is its own inverse, and we need to perform it
+;# again before writing the pixels back into the frame buffer.
+;#
+;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
+;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
+;# defined above. We think of both groups of 4 registers as having
+;# "addresses" {0,1,2,3} * 16.
+;#
+.macro Transpose4times4x4 Vlo, Vhi
+
+ ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
+
+ vmrghb v4, v0, v1
+ vmrglb v5, v0, v1
+ vmrghb v6, v2, v3
+ vmrglb v7, v2, v3
+
+ ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
+
+ vmrghh v0, v4, v6
+ vmrglh v1, v4, v6
+ vmrghh v2, v5, v7
+ vmrglh v3, v5, v7
+
+ ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
+
+ vmrghw v4, v0, v1
+ vmrglw v5, v0, v1
+ vmrghw v6, v2, v3
+ vmrglw v7, v2, v3
+
+ ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
+
+ vperm v0, v4, v6, \Vlo
+ vperm v1, v4, v6, \Vhi
+ vperm v2, v5, v7, \Vlo
+ vperm v3, v5, v7, \Vhi
+.endm
+;# end Transpose4times4x4
+
+
+;# Normal mb vertical edge filter transpose.
+;#
+;# We read 8 columns of data, initially in the following pattern:
+;#
+;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
+;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
+;# ...
+;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
+;#
+;# and wish to convert to:
+;#
+;# (0,0) ... (0,15)
+;# (1,0) ... (1,15)
+;# ...
+;# (7,0) ... (7,15).
+;#
+;# In "address bit" language, we wish to map
+;#
+;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
+;#
+;# This can be accomplished by 4 iterations of the cyclic transform
+;#
+;# I -> (I+1) mod 7;
+;#
+;# each iteration can be realized by (d=0, s=2):
+;#
+;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
+;#
+;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
+;# preserving v8 = sign converter.
+;#
+;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
+;# result lands in the "mirror" registers v10...v17
+;#
+.macro t8x16_odd
+ Tpair v10, v11, v0, v4
+ Tpair v12, v13, v1, v5
+ Tpair v14, v15, v2, v6
+ Tpair v16, v17, v3, v7
+.endm
+
+.macro t8x16_even
+ Tpair v0, v1, v10, v14
+ Tpair v2, v3, v11, v15
+ Tpair v4, v5, v12, v16
+ Tpair v6, v7, v13, v17
+.endm
+
+.macro transpose8x16_fwd
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+ t8x16_even
+.endm
+
+.macro transpose8x16_inv
+ t8x16_odd
+ t8x16_even
+ t8x16_odd
+.endm
+
+.macro Transpose16x16
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+ vmrghb v0, v16, v24
+ vmrglb v1, v16, v24
+ vmrghb v2, v17, v25
+ vmrglb v3, v17, v25
+ vmrghb v4, v18, v26
+ vmrglb v5, v18, v26
+ vmrghb v6, v19, v27
+ vmrglb v7, v19, v27
+ vmrghb v8, v20, v28
+ vmrglb v9, v20, v28
+ vmrghb v10, v21, v29
+ vmrglb v11, v21, v29
+ vmrghb v12, v22, v30
+ vmrglb v13, v22, v30
+ vmrghb v14, v23, v31
+ vmrglb v15, v23, v31
+ vmrghb v16, v0, v8
+ vmrglb v17, v0, v8
+ vmrghb v18, v1, v9
+ vmrglb v19, v1, v9
+ vmrghb v20, v2, v10
+ vmrglb v21, v2, v10
+ vmrghb v22, v3, v11
+ vmrglb v23, v3, v11
+ vmrghb v24, v4, v12
+ vmrglb v25, v4, v12
+ vmrghb v26, v5, v13
+ vmrglb v27, v5, v13
+ vmrghb v28, v6, v14
+ vmrglb v29, v6, v14
+ vmrghb v30, v7, v15
+ vmrglb v31, v7, v15
+.endm
+
+;# load_g loads a global vector (whose address is in the local variable Gptr)
+;# into vector register Vreg. Trashes r0
+.macro load_g Vreg, Gptr
+ lwz r0, \Gptr
+ lvx \Vreg, 0, r0
+.endm
+
+;# Exploit the saturation here: if the difference is negative it is
+;# clamped to 0, and ORing 0 with the positive difference yields the
+;# absolute value.
+;# RES = abs( A-B), trashes TMP
+.macro Abs RES, TMP, A, B
+ vsububs \RES, \A, \B
+ vsububs \TMP, \B, \A
+ vor \RES, \RES, \TMP
+.endm
+
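+;# Scalar model of the trick above (illustrative only):
+;#
+;#   static unsigned char abs_diff_u8(unsigned char a, unsigned char b) {
+;#       unsigned char d0 = a > b ? a - b : 0;  /* vsububs a, b */
+;#       unsigned char d1 = b > a ? b - a : 0;  /* vsububs b, a */
+;#       return d0 | d1;                        /* vor: one side is 0 */
+;#   }
+;#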
+;# RES = Max( RES, abs( A-B)), trashes TMP
+.macro max_abs RES, TMP, A, B
+ vsububs \TMP, \A, \B
+ vmaxub \RES, \RES, \TMP
+ vsububs \TMP, \B, \A
+ vmaxub \RES, \RES, \TMP
+.endm
+
+.macro Masks
+ ;# build masks
+ ;# The inputs are all 8 bit unsigned (0-255). We need
+ ;# abs(vala-valb) > limit, but there is no need to compare each
+ ;# value to the limit: find the max of the absolute differences
+ ;# and compare that to the limit.
+ ;# First hev
+ Abs v14, v13, v2, v3 ;# |P1 - P0|
+ max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
+
+ vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
+
+ ;# Next limit
+ max_abs v14, v13, v0, v1 ;# |P3 - P2|
+ max_abs v14, v13, v1, v2 ;# |P2 - P1|
+ max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
+ max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
+
+ vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
+
+ ;# flimit
+ Abs v14, v13, v3, v4 ;# |P0 - Q0|
+
+ vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
+
+ vor v8, v8, v9 ;# R = true if flimit or limit exceeded
+ ;# done building masks
+.endm
+
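+;# Per-pixel scalar model of the three masks (illustrative only; p[0..3]
+;# are P0..P3 and q[0..3] are Q0..Q3 for one column):
+;#
+;#   #define AD(a, b) ((a) > (b) ? (a) - (b) : (b) - (a))
+;#   static void masks_ref(const unsigned char *p, const unsigned char *q,
+;#                         int flim, int lim, int thr, int *hev, int *skip) {
+;#       int m = AD(p[1], p[0]);
+;#       if (AD(q[1], q[0]) > m) m = AD(q[1], q[0]);
+;#       *hev = m > thr;                       /* high edge variance */
+;#       if (AD(p[3], p[2]) > m) m = AD(p[3], p[2]);
+;#       if (AD(p[2], p[1]) > m) m = AD(p[2], p[1]);
+;#       if (AD(q[2], q[1]) > m) m = AD(q[2], q[1]);
+;#       if (AD(q[3], q[2]) > m) m = AD(q[3], q[2]);
+;#       *skip = (m > lim) || (AD(p[0], q[0]) > flim);
+;#   }
+;#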
+.macro build_constants RFL, RLI, RTH, FL, LI, TH
+ ;# build constants
+ lvx \FL, 0, \RFL ;# flimit
+ lvx \LI, 0, \RLI ;# limit
+ lvx \TH, 0, \RTH ;# thresh
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+.endm
+
+.macro load_data_y
+ ;# setup strides/pointers to be able to access
+ ;# all of the data
+ add r5, r4, r4 ;# r5 = 2 * stride
+ sub r6, r3, r5 ;# r6 -> 2 rows back
+ neg r7, r4 ;# r7 = -stride
+
+ ;# load 16 pixels worth of data to work on
+ sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
+ lvx v0, 0, r0 ;# P3 (read only)
+ lvx v1, r7, r6 ;# P2
+ lvx v2, 0, r6 ;# P1
+ lvx v3, r7, r3 ;# P0
+ lvx v4, 0, r3 ;# Q0
+ lvx v5, r4, r3 ;# Q1
+ lvx v6, r5, r3 ;# Q2
+ add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
+ lvx v7, r4, r0 ;# Q3 (read only)
+.endm
+
+;# Expects
+;# v10 == HEV
+;# v13 == tmp
+;# v14 == tmp
+.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
+ vxor \P1, \P1, v11 ;# SP1
+ vxor \P0, \P0, v11 ;# SP0
+ vxor \Q0, \Q0, v11 ;# SQ0
+ vxor \Q1, \Q1, v11 ;# SQ1
+
+ vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
+.if \HEV_PRESENT
+ vand v13, v13, v10 ;# f &= hev
+.endif
+ vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+
+ vandc v13, v13, v8 ;# f &= mask
+
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v13, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v13, v8 ;# f2 = c (f+3)
+
+ vsrab v13, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
+ vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
+.endm
+
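+;# Scalar model of the adjustment (illustrative only; every value is in the
+;# signed domain, i.e. pel ^ 0x80, mask/hev are 0 or 1, and >> is an
+;# arithmetic shift, matching vsrab):
+;#
+;#   static int c(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
+;#   static void adjust_ref(int *p0, int *q0, int p1, int q1,
+;#                          int mask, int hev, int use_hev) {
+;#       int w = c(*q0 - *p0);
+;#       int f = c(p1 - q1);
+;#       if (use_hev && !hev) f = 0;
+;#       f = c(c(c(f + w) + w) + w);     /* f + 3*(Q0-P0), saturating */
+;#       if (!mask) f = 0;
+;#       *q0 = c(*q0 - (c(f + 4) >> 3)); /* round one side with +4 */
+;#       *p0 = c(*p0 + (c(f + 3) >> 3)); /* and the other with +3 */
+;#   }
+;#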
+.macro vp8_mbfilter
+ Masks
+
+ ;# start the filtering here
+ vxor v1, v1, v11 ;# SP2
+ vxor v2, v2, v11 ;# SP1
+ vxor v3, v3, v11 ;# SP0
+ vxor v4, v4, v11 ;# SQ0
+ vxor v5, v5, v11 ;# SQ1
+ vxor v6, v6, v11 ;# SQ2
+
+ ;# add outer taps if we have high edge variance
+ vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
+
+ vsubsbs v14, v4, v3 ;# SQ0-SP0
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14
+ vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
+
+ vandc v13, v13, v8 ;# f &= mask
+ vand v15, v13, v10 ;# f2 = f & hev
+
+ ;# save bottom 3 bits so that we round one side +4 and the other +3
+ vspltisb v8, 3
+ vspltisb v9, 4
+
+ vaddsbs v14, v15, v9 ;# f1 = c (f+4)
+ vaddsbs v15, v15, v8 ;# f2 = c (f+3)
+
+ vsrab v14, v14, v8 ;# f1 >>= 3
+ vsrab v15, v15, v8 ;# f2 >>= 3
+
+ vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
+ vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
+
+ ;# only apply wider filter if not high edge variance
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vspltisb v9, 2
+ vnor v8, v8, v8
+ vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
+ vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
+ vspltisb v8, 9
+
+ ;# roughly 1/7th difference across boundary
+ vspltish v10, 7
+ vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v8, v13
+ vaddshs v14, v14, v9 ;# += 63
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v6, v6, v10 ;# subtract from Q and add to P
+ vaddsbs v1, v1, v10
+
+ vxor v6, v6, v11
+ vxor v1, v1, v11
+
+ ;# roughly 2/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v8, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v5, v5, v10 ;# subtract from Q and add to P
+ vaddsbs v2, v2, v10
+
+ vxor v5, v5, v11
+ vxor v2, v2, v11
+
+ ;# roughly 3/7th difference across boundary
+ vspltish v10, 7
+ vaddubm v12, v12, v8
+ vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
+ vmulesb v15, v12, v13
+ vaddshs v14, v14, v9
+ vaddshs v15, v15, v9
+ vsrah v14, v14, v10 ;# >>= 7
+ vsrah v15, v15, v10
+ vmrglh v10, v15, v14
+ vmrghh v15, v15, v14
+
+ vpkshss v10, v15, v10 ;# X = saturated down to bytes
+
+ vsubsbs v4, v4, v10 ;# subtract from Q and add to P
+ vaddsbs v3, v3, v10
+
+ vxor v4, v4, v11
+ vxor v3, v3, v11
+.endm
+
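+;# Scalar model of one weighting pass (illustrative only): the three passes
+;# above use k = 9, 18 and 27, i.e. roughly 1/7, 2/7 and 3/7 of the filter
+;# value f, with rounding:
+;#
+;#   static int mb_tap_ref(int f, int k) {
+;#       int u = (k * f + 63) >> 7;                  /* multiply, +63, >> 7 */
+;#       return u < -128 ? -128 : u > 127 ? 127 : u; /* vpkshss */
+;#   }
+;#   /* usage: q2 -= mb_tap_ref(f, 9); p2 += mb_tap_ref(f, 9); and so on,
+;#      with k = 18 for Q1/P1 and k = 27 for Q0/P0 */
+;#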
+.macro SBFilter
+ Masks
+
+ common_adjust v3, v4, v2, v5, 1
+
+ ;# outer tap adjustments
+ vspltisb v8, 1
+
+ vaddubm v13, v13, v8 ;# f += 1
+ vsrab v13, v13, v8 ;# f >>= 1
+
+ vandc v13, v13, v10 ;# f &= ~hev
+
+ vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
+ vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
+
+ vxor v2, v2, v11
+ vxor v3, v3, v11
+ vxor v4, v4, v11
+ vxor v5, v5, v11
+.endm
+
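+;# Outer-tap model (illustrative only): halve the inner filter value with
+;# rounding and apply it to P1/Q1 only where there is no high edge variance:
+;#
+;#   int g = (f1 + 1) >> 1;             /* f += 1; f >>= 1 */
+;#   if (!hev) { q1 = c(q1 - g); p1 = c(p1 + g); }
+;#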
+ .align 2
+mbloop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ vp8_mbfilter
+
+ stvx v1, r7, r6 ;# P2
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+ stvx v6, r5, r3 ;# Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_horizontal_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ load_data_y
+
+ SBFilter
+
+ stvx v2, 0, r6 ;# P1
+ stvx v3, r7, r3 ;# P0
+ stvx v4, 0, r3 ;# Q0
+ stvx v5, r4, r3 ;# Q1
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary, so an
+;# entire mb can be read in aligned. However, filtering the mb edge is a
+;# problem: the loopfilter requires 4 bytes before the mb and 4 after, for a
+;# total of 8 bytes, and reading 16 bytes in order to get 4 is a bit of a
+;# waste. So this is an even uglier way to get around that.
+;# Using the regular register file, words are read in and then saved back
+;# out to memory to align and order them, and from there they are read in
+;# through the vector register file.
+.macro RLVmb V, R
+ lwzux r0, r3, r4
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r3, r4
+ stw r0,12(\R)
+ lwz r0,-4(r3)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+.macro WLVmb V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r3, r4
+ lwz r0, 8(\R)
+ stw r0,-4(r3)
+ lwz r0, 4(\R)
+ stwux r0, r3, r4
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
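+;# Scalar model of one RLVmb row (illustrative only; assumes <string.h>):
+;# gather 4 pels before and 4 after the edge into an aligned 16-byte buffer
+;# that lvx can then read in one shot.
+;#
+;#   static void rlv_row_ref(const unsigned char *s, unsigned char *buf) {
+;#       memcpy(buf + 0, s - 4, 4);  /* 4 pels left of the edge */
+;#       memcpy(buf + 4, s,     4);  /* 4 pels right of the edge */
+;#   }
+;#   /* two such rows fill one vector register's worth of data */
+;#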
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+mbloop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+ sub r3, r3, r4
+
+ RLVmb v0, r9
+ RLVmb v1, r9
+ RLVmb v2, r9
+ RLVmb v3, r9
+ RLVmb v4, r9
+ RLVmb v5, r9
+ RLVmb v6, r9
+ RLVmb v7, r9
+
+ transpose8x16_fwd
+
+ build_constants r5, r6, r7, v8, v9, v10
+
+ vp8_mbfilter
+
+ transpose8x16_inv
+
+ add r3, r3, r4
+ neg r4, r4
+
+ WLVmb v17, r9
+ WLVmb v16, r9
+ WLVmb v15, r9
+ WLVmb v14, r9
+ WLVmb v13, r9
+ WLVmb v12, r9
+ WLVmb v11, r9
+ WLVmb v10, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RL V, R, P
+ lvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro WL V, R, P
+ stvx \V, 0, \R
+ add \R, \R, \P
+.endm
+
+.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
+ ;# K = |P0-P1| already
+ Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
+ vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
+ vcmpgtub v10, v14, v0
+
+ Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1]
+
+ max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
+ max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
+ max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
+
+ vmaxub v14, v14, v4 ;# M = max interior abs diff
+ vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
+
+ Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
+ vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
+ vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
+
+ ;# replace P1,Q1 w/signed versions
+ common_adjust \P0, \Q0, \P1, \Q1, 1
+
+ vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
+ vsrab v13, v13, v1
+ vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
+ vsubsbs \Q1, \Q1, v13
+ vaddsbs \P1, \P1, v13
+
+ vxor \P1, \P1, v11 ;# P1
+ vxor \P0, \P0, v11 ;# P0
+ vxor \Q0, \Q0, v11 ;# Q0
+ vxor \Q1, \Q1, v11 ;# Q1
+.endm
+
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+;# r6 const signed char *limit
+;# r7 const signed char *thresh
+loop_filter_vertical_edge_y_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ addi r9, r3, 0
+ RL v16, r9, r4
+ RL v17, r9, r4
+ RL v18, r9, r4
+ RL v19, r9, r4
+ RL v20, r9, r4
+ RL v21, r9, r4
+ RL v22, r9, r4
+ RL v23, r9, r4
+ RL v24, r9, r4
+ RL v25, r9, r4
+ RL v26, r9, r4
+ RL v27, r9, r4
+ RL v28, r9, r4
+ RL v29, r9, r4
+ RL v30, r9, r4
+ lvx v31, 0, r9
+
+ Transpose16x16
+
+ vspltisb v1, 1
+
+ build_constants r5, r6, r7, v3, v2, v0
+
+ Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1|
+
+ Fil v16, v17, v18, v19, v20, v21, v22, v23
+ Fil v20, v21, v22, v23, v24, v25, v26, v27
+ Fil v24, v25, v26, v27, v28, v29, v30, v31
+
+ Transpose16x16
+
+ addi r9, r3, 0
+ WL v16, r9, r4
+ WL v17, r9, r4
+ WL v18, r9, r4
+ WL v19, r9, r4
+ WL v20, r9, r4
+ WL v21, r9, r4
+ WL v22, r9, r4
+ WL v23, r9, r4
+ WL v24, r9, r4
+ WL v25, r9, r4
+ WL v26, r9, r4
+ WL v27, r9, r4
+ WL v28, r9, r4
+ WL v29, r9, r4
+ WL v30, r9, r4
+ stvx v31, 0, r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+.macro active_chroma_sel V
+ andi. r7, r3, 8 ;# row origin modulo 16
+ add r7, r7, r7 ;# selects selectors
+ lis r12, _chromaSelectors@ha
+ la r0, _chromaSelectors@l(r12)
+ lwzux r0, r7, r0 ;# leave selector addr in r7
+
+ lvx \V, 0, r0 ;# mask to concatenate active U,V pels
+.endm
+
+.macro hread_uv Dest, U, V, Offs, VMask
+ lvx \U, \Offs, r3
+ lvx \V, \Offs, r4
+ vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
+.endm
+
+.macro hwrite_uv New, U, V, Offs, Umask, Vmask
+ vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
+ vperm \V, \New, \V, \Vmask
+ stvx \U, \Offs, r3 ;# Write to frame buffer
+ stvx \V, \Offs, r4
+.endm
+
+;# Process U,V in parallel.
+.macro load_chroma_h
+ neg r9, r5 ;# r9 = -1 * stride
+ add r8, r9, r9 ;# r8 = -2 * stride
+ add r10, r5, r5 ;# r10 = 2 * stride
+
+ active_chroma_sel v12
+
+ ;# P3, Q3 are read-only; need not save addresses or sibling pels
+ add r6, r8, r8 ;# r6 = -4 * stride
+ hread_uv v0, v14, v15, r6, v12
+ add r6, r10, r5 ;# r6 = 3 * stride
+ hread_uv v7, v14, v15, r6, v12
+
+ ;# Others are read/write; save addresses and sibling pels
+
+ add r6, r8, r9 ;# r6 = -3 * stride
+ hread_uv v1, v16, v17, r6, v12
+ hread_uv v2, v18, v19, r8, v12
+ hread_uv v3, v20, v21, r9, v12
+ hread_uv v4, v22, v23, 0, v12
+ hread_uv v5, v24, v25, r5, v12
+ hread_uv v6, v26, v27, r10, v12
+.endm
+
+.macro uresult_sel V
+ load_g \V, 4(r7)
+.endm
+
+.macro vresult_sel V
+ load_g \V, 8(r7)
+.endm
+
+;# always write P1,P0,Q0,Q1
+.macro store_chroma_h
+ uresult_sel v11
+ vresult_sel v12
+ hwrite_uv v2, v18, v19, r8, v11, v12
+ hwrite_uv v3, v20, v21, r9, v11, v12
+ hwrite_uv v4, v22, v23, 0, v11, v12
+ hwrite_uv v5, v24, v25, r5, v11, v12
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ vp8_mbfilter
+
+ store_chroma_h
+
+ hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
+ hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_horizontal_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ load_chroma_h
+
+ SBFilter
+
+ store_chroma_h
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro R V, R
+ lwzux r0, r3, r5
+ stw r0, 4(\R)
+ lwz r0,-4(r3)
+ stw r0, 0(\R)
+ lwzux r0, r4, r5
+ stw r0,12(\R)
+ lwz r0,-4(r4)
+ stw r0, 8(\R)
+ lvx \V, 0, \R
+.endm
+
+
+.macro W V, R
+ stvx \V, 0, \R
+ lwz r0,12(\R)
+ stwux r0, r4, r5
+ lwz r0, 8(\R)
+ stw r0,-4(r4)
+ lwz r0, 4(\R)
+ stwux r0, r3, r5
+ lwz r0, 0(\R)
+ stw r0,-4(r3)
+.endm
+
+.macro chroma_vread R
+ sub r3, r3, r5 ;# back up one line for simplicity
+ sub r4, r4, r5
+
+ R v0, \R
+ R v1, \R
+ R v2, \R
+ R v3, \R
+ R v4, \R
+ R v5, \R
+ R v6, \R
+ R v7, \R
+
+ transpose8x16_fwd
+.endm
+
+.macro chroma_vwrite R
+
+ transpose8x16_inv
+
+ add r3, r3, r5
+ add r4, r4, r5
+ neg r5, r5 ;# Write rows back in reverse order
+
+ W v17, \R
+ W v16, \R
+ W v15, \R
+ W v14, \R
+ W v13, \R
+ W v12, \R
+ W v11, \R
+ W v10, \R
+.endm
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+mbloop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ vp8_mbfilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *u
+;# r4 unsigned char *v
+;# r5 int p
+;# r6 const signed char *flimit
+;# r7 const signed char *limit
+;# r8 const signed char *thresh
+loop_filter_vertical_edge_uv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ la r9, -48(r1) ;# temporary space for reading in vectors
+
+ chroma_vread r9
+
+ build_constants r6, r7, r8, v8, v9, v10
+
+ SBFilter
+
+ chroma_vwrite r9
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+.macro vp8_simple_filter
+ Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
+ vcmpgtub v8, v14, v8 ;# v8 = true if _over_ limit
+
+ ;# preserve unsigned v0 and v3
+ common_adjust v1, v2, v0, v3, 0
+
+ vxor v1, v1, v11
+ vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
+.endm
+
+.macro simple_vertical
+ addi r8, 0, 16
+ addi r7, r5, 32
+
+ lvx v0, 0, r5
+ lvx v1, r8, r5
+ lvx v2, 0, r7
+ lvx v3, r8, r7
+
+ lis r12, _B_hihi@ha
+ la r0, _B_hihi@l(r12)
+ lvx v16, 0, r0
+
+ lis r12, _B_lolo@ha
+ la r0, _B_lolo@l(r12)
+ lvx v17, 0, r0
+
+ Transpose4times4x4 v16, v17
+ vp8_simple_filter
+
+ vxor v0, v0, v11
+ vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
+
+ Transpose4times4x4 v16, v17
+
+ stvx v0, 0, r5
+ stvx v1, r8, r5
+ stvx v2, 0, r7
+ stvx v3, r8, r7
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_horizontal_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ neg r5, r4 ;# r5 = -1 * stride
+ add r6, r5, r5 ;# r6 = -2 * stride
+
+ lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
+ lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
+ lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
+ lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
+
+ vp8_simple_filter
+
+ stvx v1, r5, r3 ;# store P0
+ stvx v2, 0, r3 ;# store Q0
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+.macro RLV Offs
+ stw r0, (\Offs*4)(r5)
+ lwzux r0, r7, r4
+.endm
+
+.macro WLV Offs
+ lwz r0, (\Offs*4)(r5)
+ stwux r0, r7, r4
+.endm
+
+ .align 2
+;# r3 unsigned char *s
+;# r4 int p
+;# r5 const signed char *flimit
+loop_filter_simple_vertical_edge_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xc000
+ mtspr 256, r12 ;# set VRSAVE
+
+ ;# build constants
+ lvx v8, 0, r5 ;# flimit
+
+ vspltisb v11, 8
+ vspltisb v12, 4
+ vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
+
+ la r5, -96(r1) ;# temporary space for reading in vectors
+
+ ;# Store 4 pels at word "Offs" in temp array, then advance r7
+ ;# to next row and read another 4 pels from the frame buffer.
+
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r7 ;# read first 4 pels
+
+ ;# 16 unaligned word accesses
+ RLV 0
+ RLV 4
+ RLV 8
+ RLV 12
+ RLV 1
+ RLV 5
+ RLV 9
+ RLV 13
+ RLV 2
+ RLV 6
+ RLV 10
+ RLV 14
+ RLV 3
+ RLV 7
+ RLV 11
+
+ stw r0, (15*4)(r5) ;# write last 4 pels
+
+ simple_vertical
+
+ ;# Read temp array, write frame buffer.
+ subi r7, r3, 2 ;# r7 -> 2 pels before start
+ lwzx r0, 0, r5 ;# read/write first 4 pels
+ stwx r0, 0, r7
+
+ WLV 4
+ WLV 8
+ WLV 12
+ WLV 1
+ WLV 5
+ WLV 9
+ WLV 13
+ WLV 2
+ WLV 6
+ WLV 10
+ WLV 14
+ WLV 3
+ WLV 7
+ WLV 11
+ WLV 15
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+_chromaSelectors:
+ .long _B_hihi
+ .long _B_Ures0
+ .long _B_Vres0
+ .long 0
+ .long _B_lolo
+ .long _B_Ures8
+ .long _B_Vres8
+ .long 0
+
+ .align 4
+_B_Vres8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
+
+ .align 4
+_B_Ures8:
+ .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
+
+ .align 4
+_B_lolo:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_Vres0:
+ .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
+ .align 4
+_B_Ures0:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+
+ .align 4
+_B_hihi:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
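+
+;# For reference (illustrative only): with the _B_hihi selector, vperm takes
+;# bytes 0-7 of the U vector followed by bytes 0-7 of the V vector, so one
+;# register carries the 8 active U pels and then the 8 active V pels:
+;#
+;#   for (int i = 0; i < 8; i++) { dst[i] = u[i]; dst[i + 8] = v[i]; }
+;#
+;# The _B_Ures*/_B_Vres* selectors invert this on the write side, merging
+;# the filtered pels back with the untouched half of each original vector.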
diff --git a/libvpx/vp8/common/ppc/platform_altivec.asm b/libvpx/vp8/common/ppc/platform_altivec.asm
new file mode 100644
index 0000000..f81d86f
--- /dev/null
+++ b/libvpx/vp8/common/ppc/platform_altivec.asm
@@ -0,0 +1,59 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl save_platform_context
+ .globl restore_platform_context
+
+.macro W V P
+ stvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+.macro R V P
+ lvx \V, 0, \P
+ addi \P, \P, 16
+.endm
+
+;# r3 context_ptr
+ .align 2
+save_platform_context:
+ W v20, r3
+ W v21, r3
+ W v22, r3
+ W v23, r3
+ W v24, r3
+ W v25, r3
+ W v26, r3
+ W v27, r3
+ W v28, r3
+ W v29, r3
+ W v30, r3
+ W v31, r3
+
+ blr
+
+;# r3 context_ptr
+ .align 2
+restore_platform_context:
+ R v20, r3
+ R v21, r3
+ R v22, r3
+ R v23, r3
+ R v24, r3
+ R v25, r3
+ R v26, r3
+ R v27, r3
+ R v28, r3
+ R v29, r3
+ R v30, r3
+ R v31, r3
+
+ blr
diff --git a/libvpx/vp8/common/ppc/recon_altivec.asm b/libvpx/vp8/common/ppc/recon_altivec.asm
new file mode 100644
index 0000000..dd39e05
--- /dev/null
+++ b/libvpx/vp8/common/ppc/recon_altivec.asm
@@ -0,0 +1,175 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl recon4b_ppc
+ .globl recon2b_ppc
+ .globl recon_b_ppc
+
+.macro row_of16 Diff Pred Dst Stride
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ addi \Pred, \Pred, 16 ;# next pred
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ addi \Diff, \Diff, 32 ;# next diff
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, \Dst ;# to dst
+ add \Dst, \Dst, \Stride ;# next dst
+.endm
+
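+;# Scalar model of one row (illustrative only): reconstruction adds the
+;# 16-bit residual to the 8-bit prediction and saturates back to a pel:
+;#
+;#   static unsigned char clamp_u8(int v) {
+;#       return v < 0 ? 0 : v > 255 ? 255 : (unsigned char)v; /* vpkshus */
+;#   }
+;#   /* for (int i = 0; i < 16; i++)
+;#          dst[i] = clamp_u8(pred[i] + diff[i]);    diff is a short[] */
+;#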
+ .text
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon4b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+ row_of16 r3, r4, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
+ lvx v1, 0, \Pred ;# v1 = pred = p0..p15
+ vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
+ lvx v3, 0, \Diff ;# v3 = d0..d7
+ vaddshs v2, v2, v3 ;# v2 = r0..r7
+ vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
+ lvx v3, r8, \Diff ;# v3 = d8..d15
+ vaddshs v3, v3, v1 ;# v3 = r8..r15
+ vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10 ;# 2 rows to dst from buf
+ lwz r0, 0(r10)
+.if \write_first_four_pels
+ stw r0, 0(\Dst)
+ .else
+ stwux r0, \Dst, \Stride
+.endif
+ lwz r0, 4(r10)
+ stw r0, 4(\Dst)
+ lwz r0, 8(r10)
+ stwux r0, \Dst, \Stride ;# advance dst to next row
+ lwz r0, 12(r10)
+ stw r0, 4(\Dst)
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+
+recon2b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+ li r8, 16
+
+ la r10, -48(r1) ;# buf
+
+ two_rows_of8 r3, r4, r5, r6, 1
+
+ addi r4, r4, 16; ;# next pred
+ addi r3, r3, 32; ;# next diff
+
+ two_rows_of8 r3, r4, r5, r6, 0
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
+
+.macro get_two_diff_rows
+ stw r0, 0(r10)
+ lwz r0, 4(r3)
+ stw r0, 4(r10)
+ lwzu r0, 32(r3)
+ stw r0, 8(r10)
+ lwz r0, 4(r3)
+ stw r0, 12(r10)
+ lvx v3, 0, r10
+.endm
+
+ .align 2
+;# r3 = short *diff_ptr,
+;# r4 = unsigned char *pred_ptr,
+;# r5 = unsigned char *dst_ptr,
+;# r6 = int stride
+recon_b_ppc:
+ mfspr r0, 256 ;# get old VRSAVE
+ stw r0, -8(r1) ;# save old VRSAVE to stack
+ oris r0, r0, 0xf000
+ mtspr 256,r0 ;# set VRSAVE
+
+ vxor v0, v0, v0
+
+ la r10, -48(r1) ;# buf
+
+ lwz r0, 0(r4)
+ stw r0, 0(r10)
+ lwz r0, 16(r4)
+ stw r0, 4(r10)
+ lwz r0, 32(r4)
+ stw r0, 8(r10)
+ lwz r0, 48(r4)
+ stw r0, 12(r10)
+
+ lvx v1, 0, r10; ;# v1 = pred = p0..p15
+
+ lwz r0, 0(r3) ;# v3 = d0..d7
+
+ get_two_diff_rows
+
+ vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
+ vaddshs v2, v2, v3; ;# v2 = r0..r7
+
+ lwzu r0, 32(r3) ;# v3 = d8..d15
+
+ get_two_diff_rows
+
+ vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
+ vaddshs v3, v3, v1; ;# v3 = r8..r15
+
+ vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
+ stvx v2, 0, r10; ;# 16 pels to dst from buf
+
+ lwz r0, 0(r10)
+ stw r0, 0(r5)
+ lwz r0, 4(r10)
+ stwux r0, r5, r6
+ lwz r0, 8(r10)
+ stwux r0, r5, r6
+ lwz r0, 12(r10)
+ stwx r0, r5, r6
+
+ lwz r12, -8(r1) ;# restore old VRSAVE from stack
+ mtspr 256, r12 ;# reset old VRSAVE
+
+ blr
diff --git a/libvpx/vp8/common/ppc/sad_altivec.asm b/libvpx/vp8/common/ppc/sad_altivec.asm
new file mode 100644
index 0000000..e5f2638
--- /dev/null
+++ b/libvpx/vp8/common/ppc/sad_altivec.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sad16x16_ppc
+ .globl vp8_sad16x8_ppc
+ .globl vp8_sad8x16_ppc
+ .globl vp8_sad8x8_ppc
+ .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v8, 0 ;# zero out total to start
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v8 += abs (v4 - v5)
+ vsum4ubs v8, v6, v8
+.endm
+
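+;# Scalar model (illustrative only): per-byte absolute difference summed
+;# into a running total; vsum4ubs accumulates groups of four bytes:
+;#
+;#   unsigned int sad = 0;
+;#   for (int i = 0; i < 16; i++)
+;#       sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
+;#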
+.macro sad_16_loop loop_label
+ lvsl v3, 0, r5 ;# only needs to be done once per block
+
+ ;# preload a line of data before getting into the loop
+ lvx v4, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ add r5, r5, r6
+ add r3, r3, r4
+
+ vperm v5, v1, v2, v3
+
+ .align 4
+\loop_label:
+ ;# compute difference on first row
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+
+ ;# load up next set of data
+ lvx v9, 0, r3
+ lvx v1, 0, r5
+ lvx v2, r10, r5
+
+ ;# perform abs() of difference
+ vor v6, v6, v7
+ add r3, r3, r4
+
+ ;# add to the running tally
+ vsum4ubs v8, v6, v8
+
+ ;# now onto the next line
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+ lvx v4, 0, r3
+
+ ;# compute difference on second row
+ vsububs v6, v9, v5
+ lvx v1, 0, r5
+ vsububs v7, v5, v9
+ lvx v2, r10, r5
+ vor v6, v6, v7
+ add r3, r3, r4
+ vsum4ubs v8, v6, v8
+ vperm v5, v1, v2, v3
+ add r5, r5, r6
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+ .align 4
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v7, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v7
+
+ SAD_16
+
+ bdnz \loop_label
+
+ vspltisw v7, 0
+
+ vsumsws v8, v8, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_16_loop sad16x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_16_loop sad16x8_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ sad_8_loop sad8x16_loop
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ sad_8_loop sad8x8_loop
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r7, 0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r7, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ vspltisw v8, 0 ;# zero out total to start
+
+ ;# v6 = abs (v4 - v5)
+ vsububs v6, v4, v5
+ vsububs v7, v5, v4
+ vor v6, v6, v7
+
+ ;# v7 = sum of abs (v4 - v5) (v8 is zero here)
+ vsum4ubs v7, v6, v8
+ vsumsws v7, v7, v8
+
+ stvx v7, 0, r1
+ lwz r3, 12(r1)
+
+ epilogue
+
+ blr
diff --git a/libvpx/vp8/common/ppc/systemdependent.c b/libvpx/vp8/common/ppc/systemdependent.c
new file mode 100644
index 0000000..87f4cac
--- /dev/null
+++ b/libvpx/vp8/common/ppc/systemdependent.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "subpixel.h"
+#include "loopfilter.h"
+#include "recon.h"
+#include "idct.h"
+#include "onyxc_int.h"
+
+void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
+void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
+
+extern void (*vp8_post_proc_down_and_across_mb_row)(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+);
+
+extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
+extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
+extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
+
+extern void vp8_post_proc_down_and_across_mb_row_c
+(
+ unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line,
+ int cols,
+ unsigned char *f,
+ int size
+);
+void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
+
+extern copy_mem_block_function *vp8_copy_mem16x16;
+extern copy_mem_block_function *vp8_copy_mem8x8;
+extern copy_mem_block_function *vp8_copy_mem8x4;
+
+// PPC
+extern subpixel_predict_function sixtap_predict_ppc;
+extern subpixel_predict_function sixtap_predict8x4_ppc;
+extern subpixel_predict_function sixtap_predict8x8_ppc;
+extern subpixel_predict_function sixtap_predict16x16_ppc;
+extern subpixel_predict_function bilinear_predict4x4_ppc;
+extern subpixel_predict_function bilinear_predict8x4_ppc;
+extern subpixel_predict_function bilinear_predict8x8_ppc;
+extern subpixel_predict_function bilinear_predict16x16_ppc;
+
+extern copy_mem_block_function copy_mem16x16_ppc;
+
+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
+
+// Generic C
+extern subpixel_predict_function vp8_sixtap_predict_c;
+extern subpixel_predict_function vp8_sixtap_predict8x4_c;
+extern subpixel_predict_function vp8_sixtap_predict8x8_c;
+extern subpixel_predict_function vp8_sixtap_predict16x16_c;
+extern subpixel_predict_function vp8_bilinear_predict4x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x4_c;
+extern subpixel_predict_function vp8_bilinear_predict8x8_c;
+extern subpixel_predict_function vp8_bilinear_predict16x16_c;
+
+extern copy_mem_block_function vp8_copy_mem16x16_c;
+extern copy_mem_block_function vp8_copy_mem8x8_c;
+extern copy_mem_block_function vp8_copy_mem8x4_c;
+
+void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
+
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
+
+// PPC
+extern loop_filter_block_function loop_filter_mbv_ppc;
+extern loop_filter_block_function loop_filter_bv_ppc;
+extern loop_filter_block_function loop_filter_mbh_ppc;
+extern loop_filter_block_function loop_filter_bh_ppc;
+
+extern loop_filter_block_function loop_filter_mbvs_ppc;
+extern loop_filter_block_function loop_filter_bvs_ppc;
+extern loop_filter_block_function loop_filter_mbhs_ppc;
+extern loop_filter_block_function loop_filter_bhs_ppc;
+
+// Generic C
+extern loop_filter_block_function vp8_loop_filter_mbv_c;
+extern loop_filter_block_function vp8_loop_filter_bv_c;
+extern loop_filter_block_function vp8_loop_filter_mbh_c;
+extern loop_filter_block_function vp8_loop_filter_bh_c;
+
+extern loop_filter_block_function vp8_loop_filter_mbvs_c;
+extern loop_filter_block_function vp8_loop_filter_bvs_c;
+extern loop_filter_block_function vp8_loop_filter_mbhs_c;
+extern loop_filter_block_function vp8_loop_filter_bhs_c;
+
+extern loop_filter_block_function *vp8_lf_mbvfull;
+extern loop_filter_block_function *vp8_lf_mbhfull;
+extern loop_filter_block_function *vp8_lf_bvfull;
+extern loop_filter_block_function *vp8_lf_bhfull;
+
+extern loop_filter_block_function *vp8_lf_mbvsimple;
+extern loop_filter_block_function *vp8_lf_mbhsimple;
+extern loop_filter_block_function *vp8_lf_bvsimple;
+extern loop_filter_block_function *vp8_lf_bhsimple;
+
+void vp8_clear_c(void)
+{
+}
+
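+/*
+ * The function below is the old run-time dispatch scheme: every hot routine
+ * is reached through a global function pointer, set once at startup to the
+ * AltiVec version or to the generic C fallback. A minimal sketch of the
+ * pattern (hypothetical names, not part of this file):
+ *
+ *     typedef void idct_fn(short *input, short *output, int pitch);
+ *     idct_fn *vp8_idct;
+ *     void config(int have_altivec) {
+ *         vp8_idct = have_altivec ? idct_altivec : idct_c;
+ *     }
+ *
+ * Callers always go through the pointer, so selection costs one indirect
+ * call and no per-call branching.
+ */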
+void vp8_machine_specific_config(void)
+{
+ // Pure C:
+ vp8_clear_system_state = vp8_clear_c;
+ vp8_recon_b = vp8_recon_b_c;
+ vp8_recon4b = vp8_recon4b_c;
+ vp8_recon2b = vp8_recon2b_c;
+
+ vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
+ vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
+ vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
+ vp8_bilinear_predict = bilinear_predict4x4_ppc;
+
+ vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
+ vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
+ vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
+ vp8_sixtap_predict = sixtap_predict_ppc;
+
+ vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
+ vp8_short_idct4x4 = short_idct4x4llm_ppc;
+ vp8_dc_only_idct = vp8_dc_only_idct_c;
+
+ vp8_lf_mbvfull = loop_filter_mbv_ppc;
+ vp8_lf_bvfull = loop_filter_bv_ppc;
+ vp8_lf_mbhfull = loop_filter_mbh_ppc;
+ vp8_lf_bhfull = loop_filter_bh_ppc;
+
+ vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
+ vp8_lf_bvsimple = loop_filter_bvs_ppc;
+ vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
+ vp8_lf_bhsimple = loop_filter_bhs_ppc;
+
+ vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
+ vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+ vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+ vp8_plane_add_noise = vp8_plane_add_noise_c;
+
+ vp8_copy_mem16x16 = copy_mem16x16_ppc;
+ vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+ vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+
+}
diff --git a/libvpx/vp8/common/ppc/variance_altivec.asm b/libvpx/vp8/common/ppc/variance_altivec.asm
new file mode 100644
index 0000000..fb8d5bb
--- /dev/null
+++ b/libvpx/vp8/common/ppc/variance_altivec.asm
@@ -0,0 +1,375 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_get8x8var_ppc
+ .globl vp8_get16x16var_ppc
+ .globl vp8_mse16x16_ppc
+ .globl vp8_variance16x16_ppc
+ .globl vp8_variance16x8_ppc
+ .globl vp8_variance8x16_ppc
+ .globl vp8_variance8x8_ppc
+ .globl vp8_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+ lvsl v3, 0, \R ;# permutate value for alignment
+
+ lvx v1, 0, \R
+ lvx v2, \O, \R
+
+ vperm \V, v1, v2, v3
+.endm
+
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffc0
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ li r10, 16 ;# load offset and loop counter
+
+ vspltisw v7, 0 ;# zero for merging
+ vspltisw v8, 0 ;# zero out total to start
+ vspltisw v9, 0 ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+ ;# Compute sum first. Unpack so a signed subtract
+ ;# can be used; only a halfword signed
+ ;# subtract is available. Do high, then low.
+ vmrghb v2, v7, v4
+ vmrghb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ vmrglb v2, v7, v4
+ vmrglb v3, v7, v5
+ vsubshs v2, v2, v3
+ vsum4shs v8, v2, v8
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+.endm
+
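+;# Scalar model (illustrative only): accumulate the signed sum and the sum
+;# of squared differences per pel:
+;#
+;#   for (int i = 0; i < 16; i++) {
+;#       int d = src[i] - ref[i];
+;#       sum += d;      /* 16-bit signed subtract + vsum4shs */
+;#       sse += d * d;  /* |d| squared via vmsumubm(|d|, |d|) */
+;#   }
+;#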
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
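+;# The mullw/srlwi/subf tail above implements the usual variance identity
+;# (illustrative model): var = sse - sum*sum / N, with the divide done as a
+;# shift since N = 2^DS, and all arithmetic modulo 2^32:
+;#
+;#   static unsigned int var_ref(int sum, unsigned int sse, int ds) {
+;#       return sse - (((unsigned int)sum * (unsigned int)sum) >> ds);
+;#   }
+;#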
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v6, r3, r10
+ load_aligned_16 v0, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ vmrghb v4, v4, v6
+ vmrghb v5, v5, v0
+
+ compute_sum_sse
+
+ bdnz \loop_label
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+.if \store_sum
+ stw r3, 0(r8) ;# sum
+.endif
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, get8x8var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, get16x16var_loop, 1
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+ prologue
+
+ mtctr r10
+
+mse16x16_loop:
+ ;# only one of the inputs should need to be aligned.
+ load_aligned_16 v4, r3, r10
+ load_aligned_16 v5, r5, r10
+
+ ;# move onto the next line
+ add r3, r3, r4
+ add r5, r5, r6
+
+ ;# Now compute sse.
+ vsububs v2, v4, v5
+ vsububs v3, v5, v4
+ vor v2, v2, v3
+
+ vmsumubm v9, v2, v2, v9
+
+ bdnz mse16x16_loop
+
+ vsumsws v9, v9, v7
+
+ stvx v9, 0, r1
+ lwz r3, 12(r1)
+
+ stw r3, 0(r7) ;# sse
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x16_ppc:
+
+ prologue
+
+ mtctr r10
+
+ variance_16 8, variance16x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x8_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_16 7, variance16x8_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x16_ppc:
+
+ prologue
+
+ li r9, 8
+ mtctr r9
+
+ variance_8 7, variance8x16_loop, 0
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x8_ppc:
+
+ prologue
+
+ li r9, 4
+ mtctr r9
+
+ variance_8 6, variance8x8_loop, 0
+
+ epilogue
+
+ blr
+
+.macro transfer_4x4 I P
+ lwz r0, 0(\I)
+ add \I, \I, \P
+
+ lwz r10,0(\I)
+ add \I, \I, \P
+
+ lwz r8, 0(\I)
+ add \I, \I, \P
+
+ lwz r9, 0(\I)
+
+ stw r0, 0(r1)
+ stw r10, 4(r1)
+ stw r8, 8(r1)
+ stw r9, 12(r1)
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance4x4_ppc:
+
+ prologue
+
+ transfer_4x4 r3, r4
+ lvx v4, 0, r1
+
+ transfer_4x4 r5, r6
+ lvx v5, 0, r1
+
+ compute_sum_sse
+
+ vsumsws v8, v8, v7
+ vsumsws v9, v9, v7
+
+ stvx v8, 0, r1
+ lwz r3, 12(r1)
+
+ stvx v9, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r7) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, 4 ;# (sum*sum) >> 4
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
+
+ epilogue
+
+ blr
diff --git a/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm b/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
new file mode 100644
index 0000000..2308373
--- /dev/null
+++ b/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
@@ -0,0 +1,865 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_sub_pixel_variance4x4_ppc
+ .globl vp8_sub_pixel_variance8x8_ppc
+ .globl vp8_sub_pixel_variance8x16_ppc
+ .globl vp8_sub_pixel_variance16x8_ppc
+ .globl vp8_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+ load_c \V0, vfilter_b, r6, r12, r10
+
+ addi r6, r6, 16
+ lvx \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+ ;# load up horizontal filter
+ slwi. r5, r5, 4 ;# index into horizontal filter array
+
+ ;# index to the next set of vectors in the row.
+ li r10, 16
+
+ ;# downshift by 7 ( divide by 128 ) at the end
+ vspltish v19, 7
+
+ ;# If there isn't any filtering to be done for the horizontal, then
+ ;# just skip to the second pass.
+ beq \jump_label
+
+ load_c v20, hfilter_b, r5, r12, r0
+
+ ;# setup constants
+ ;# v28 permutation value for the filtered output
+ load_c v28, b_hperm_b, 0, r12, r0
+
+ ;# index to the next set of vectors in the row.
+ li r12, 32
+
+ ;# rounding added in on the multiply
+ vspltisw v21, 8
+ vspltisw v18, 3
+ vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
+
+ slwi. r6, r6, 5 ;# index into vertical filter array
+.endm
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm intput
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+ lvsl v17, 0, r3 ;# permutate value for alignment
+
+ ;# input to filter is 9 bytes wide, output is 8 bytes.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+
+ vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
+ vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
+
+ vmsummbm v24, v20, v24, v18
+ vmsummbm v25, v20, v25, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+ vsrh v24, v24, v19 ;# divide v0, v1 by 128
+
+ vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
+.endm
+
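+;# Scalar model of one output pel (illustrative only; the 4-tap layout is
+;# assumed from the 0123/1234 permutes above): a dot product over
+;# consecutive input bytes, rounded by 64 and divided by 128:
+;#
+;#   static unsigned char hfil_ref(const unsigned char *s,
+;#                                 const signed char *t) {
+;#       int a = s[0]*t[0] + s[1]*t[1] + s[2]*t[2] + s[3]*t[3] + 64;
+;#       a >>= 7;                                  /* vsrh by v19 = 7 */
+;#       return a < 0 ? 0 : a > 255 ? 255 : (unsigned char)a;
+;#   }
+;#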
+.macro vfilter_16 P0 P1
+ vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
+ vadduhm v22, v18, v22
+ vmuloub v23, \P0, v20
+ vadduhm v23, v18, v23
+
+ vmuleub v24, \P1, v21
+ vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
+ vmuloub v25, \P1, v21
+ vadduhm v23, v23, v25 ;# Ro = odds
+
+ vsrh v22, v22, v19 ;# divide by 128
+ vsrh v23, v23, v19 ;# v16 v17 = evens, odds
+ vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
+ vmrglh v23, v22, v23
+ vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
+.endm
+
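+;# Scalar model (illustrative only): a two-tap vertical blend, taps summing
+;# to 128, with the same +64 rounding and >> 7 scaling:
+;#
+;#   /* out[i] = clamp_u8((p0[i] * t0 + p1[i] * t1 + 64) >> 7); */
+;#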
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+ ;# Compute sum first. Unpack so a signed subtract
+ ;# can be used; only a halfword signed
+ ;# subtract is available. Do high, then low.
+ vmrghb \t1, \z0, \src
+ vmrghb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ vmrglb \t1, \z0, \src
+ vmrglb \t2, \z0, \ref
+ vsubshs \t1, \t1, \t2
+ vsum4shs \sum, \t1, \sum
+
+ ;# Now compute sse.
+ vsububs \t1, \src, \ref
+ vsububs \t2, \ref, \src
+ vor \t1, \t1, \t2
+
+ vmsumubm \sse, \t1, \t1, \sse
+.endm
+
+.macro variance_final sum, sse, z0, DS
+ vsumsws \sum, \sum, \z0
+ vsumsws \sse, \sse, \z0
+
+ stvx \sum, 0, r1
+ lwz r3, 12(r1)
+
+ stvx \sse, 0, r1
+ lwz r4, 12(r1)
+
+ stw r4, 0(r9) ;# sse
+
+ mullw r3, r3, r3 ;# sum*sum
+ srlwi r3, r3, \DS ;# (sum*sum) >> DS
+ subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro compute_sum_sse_16 V, increment_counter
+ load_and_align_16 v16, r7, r8, \increment_counter
+ compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+ lvsl v17, 0, \R ;# permutate value for alignment
+
+ ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# the input can span three vectors if not aligned correctly.
+ lvx v21, 0, \R
+ lvx v22, r10, \R
+
+.if \increment_counter
+ add \R, \R, \P
+.endif
+
+ vperm \V, v21, v22, v17
+.endm
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance4x4_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf830
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_4x4_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_4x4_b
+
+ hfilter_8 v4, v10, v11, 0
+
+ b second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+ slwi r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+
+compute_sum_sse_4x4_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_c v10, b_hilo_b, 0, r12, r0
+
+ vperm v0, v0, v1, v10
+ vperm v1, v2, v3, v10
+
+ compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 4
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfff0
+ ori r12, r12, 0xffff
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x8_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v10, b_0123_b, 0, r12, r0
+ load_c v11, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v10, v11, 1
+ hfilter_8 v1, v10, v11, 1
+ hfilter_8 v2, v10, v11, 1
+ hfilter_8 v3, v10, v11, 1
+ hfilter_8 v4, v10, v11, 1
+ hfilter_8 v5, v10, v11, 1
+ hfilter_8 v6, v10, v11, 1
+ hfilter_8 v7, v10, v11, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x8_b
+
+ hfilter_8 v8, v10, v11, 0
+
+ b second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 0
+
+ beq compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+
+ load_and_align_16 v4, r7, r8, 1
+ load_and_align_16 v5, r7, r8, 1
+ load_and_align_16 v6, r7, r8, 1
+ load_and_align_16 v7, r7, r8, 1
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 0
+
+ vmrghb v4, v4, v5
+ vmrghb v5, v6, v7
+ vmrghb v6, v8, v9
+ vmrghb v7, v10, v11
+
+ compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 6
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ HProlog second_pass_8x16_pre_copy_b
+
+ ;# Load up permutation constants
+ load_c v29, b_0123_b, 0, r12, r0
+ load_c v30, b_4567_b, 0, r12, r0
+
+ hfilter_8 v0, v29, v30, 1
+ hfilter_8 v1, v29, v30, 1
+ hfilter_8 v2, v29, v30, 1
+ hfilter_8 v3, v29, v30, 1
+ hfilter_8 v4, v29, v30, 1
+ hfilter_8 v5, v29, v30, 1
+ hfilter_8 v6, v29, v30, 1
+ hfilter_8 v7, v29, v30, 1
+ hfilter_8 v8, v29, v30, 1
+ hfilter_8 v9, v29, v30, 1
+ hfilter_8 v10, v29, v30, 1
+ hfilter_8 v11, v29, v30, 1
+ hfilter_8 v12, v29, v30, 1
+ hfilter_8 v13, v29, v30, 1
+ hfilter_8 v14, v29, v30, 1
+ hfilter_8 v15, v29, v30, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_8x16_b
+
+ hfilter_8 v16, v29, v30, 0
+
+ b second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ vmrghb v0, v0, v1
+ vmrghb v1, v2, v3
+ vmrghb v2, v4, v5
+ vmrghb v3, v6, v7
+ vmrghb v4, v8, v9
+ vmrghb v5, v10, v11
+ vmrghb v6, v12, v13
+ vmrghb v7, v14, v15
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 1
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v0, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v1, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+ load_and_align_16 v8, r7, r8, 1
+ load_and_align_16 v9, r7, r8, 1
+ load_and_align_16 v10, r7, r8, 1
+ load_and_align_16 v11, r7, r8, 1
+ load_and_align_16 v12, r7, r8, 1
+ load_and_align_16 v13, r7, r8, 1
+ load_and_align_16 v14, r7, r8, 1
+ load_and_align_16 v15, r7, r8, 0
+
+ vmrghb v8, v8, v9
+ vmrghb v9, v10, v11
+ vmrghb v10, v12, v13
+ vmrghb v11, v14, v15
+
+ compute_sum_sse v4, v8, v18, v19, v20, v21, v23
+ compute_sum_sse v5, v9, v18, v19, v20, v21, v23
+ compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+ compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+ blr
+
+;# Filters a horizontal line
+;# expects:
+;# r3 src_ptr
+;# r4 pitch
+;# r10 16
+;# r12 32
+;# v17 perm input
+;# v18 rounding
+;# v19 shift
+;# v20 filter taps
+;# v21 tmp
+;# v22 tmp
+;# v23 tmp
+;# v24 tmp
+;# v25 tmp
+;# v26 tmp
+;# v27 tmp
+;# v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl v17, 0, r3 ;# permute value for alignment
+
+    ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;#  input can span three vectors if not aligned correctly.
+ lvx v21, 0, r3
+ lvx v22, r10, r3
+ lvx v23, r12, r3
+
+.if \increment_counter
+ add r3, r3, r4
+.endif
+ vperm v21, v21, v22, v17
+    vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
+
+ ;# set 0
+ vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+ ;# set 1
+ vsldoi v23, v21, v22, 1
+ vmsummbm v25, v20, v23, v18
+
+ ;# set 2
+ vsldoi v23, v21, v22, 2
+ vmsummbm v26, v20, v23, v18
+
+ ;# set 3
+ vsldoi v23, v21, v22, 3
+ vmsummbm v27, v20, v23, v18
+
+ vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+ vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh v24, v24, v19 ;# divide v24, v25 by 128
+ vsrh v25, v25, v19
+
+ vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
+ vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
+.endm
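
Per output pixel this macro computes the first-pass bilinear tap sum (src[i]*t0 + src[i+1]*t1 + rounding) >> 7, with t0 + t1 = 128 taken from the hfilter_b table below. A scalar reference of that arithmetic, assuming the rounding constant held in v18 is 64 as in the second pass:

    /* First-pass horizontal bilinear filter, scalar form. xoffset is in
     * 1/8-pel units (0..7); hfilter_b stores t0 = 128 - 16*xoffset and
     * t1 = 16*xoffset. */
    static void hfilter_16_c(const unsigned char *src, unsigned char *dst,
                             int xoffset)
    {
        int t1 = 16 * xoffset;
        int t0 = 128 - t1; /* taps always sum to 128 */
        int i;

        for (i = 0; i < 16; i++)
            dst[i] = (unsigned char)((src[i] * t0 + src[i + 1] * t1 + 64) >> 7);
    }
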
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x8_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x8_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x8_b
+
+ hfilter_16 v8, 0
+
+ b second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+
+ beq compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+
+compute_sum_sse_16x8_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 0
+
+ variance_final v18, v19, v23, 7
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int src_pixels_per_line
+;# r5 int xoffset
+;# r6 int yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x16_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xffff
+ ori r12, r12, 0xfff8
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1, -32(r1) ;# create space on the stack
+
+ HProlog second_pass_16x16_pre_copy_b
+
+ hfilter_16 v0, 1
+ hfilter_16 v1, 1
+ hfilter_16 v2, 1
+ hfilter_16 v3, 1
+ hfilter_16 v4, 1
+ hfilter_16 v5, 1
+ hfilter_16 v6, 1
+ hfilter_16 v7, 1
+ hfilter_16 v8, 1
+ hfilter_16 v9, 1
+ hfilter_16 v10, 1
+ hfilter_16 v11, 1
+ hfilter_16 v12, 1
+ hfilter_16 v13, 1
+ hfilter_16 v14, 1
+ hfilter_16 v15, 1
+
+ ;# Finished filtering main horizontal block. If there is no
+ ;# vertical filtering, jump to storing the data. Otherwise
+ ;# load up and filter the additional line that is needed
+ ;# for the vertical filter.
+ beq compute_sum_sse_16x16_b
+
+ hfilter_16 v16, 0
+
+ b second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+ slwi. r6, r6, 5 ;# index into vertical filter array
+
+ load_and_align_16 v0, r3, r4, 1
+ load_and_align_16 v1, r3, r4, 1
+ load_and_align_16 v2, r3, r4, 1
+ load_and_align_16 v3, r3, r4, 1
+ load_and_align_16 v4, r3, r4, 1
+ load_and_align_16 v5, r3, r4, 1
+ load_and_align_16 v6, r3, r4, 1
+ load_and_align_16 v7, r3, r4, 1
+ load_and_align_16 v8, r3, r4, 1
+ load_and_align_16 v9, r3, r4, 1
+ load_and_align_16 v10, r3, r4, 1
+ load_and_align_16 v11, r3, r4, 1
+ load_and_align_16 v12, r3, r4, 1
+ load_and_align_16 v13, r3, r4, 1
+ load_and_align_16 v14, r3, r4, 1
+ load_and_align_16 v15, r3, r4, 1
+ load_and_align_16 v16, r3, r4, 0
+
+ beq compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+ vspltish v20, 8
+ vspltish v18, 3
+ vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+ load_vfilter v20, v21
+
+ vfilter_16 v0, v1
+ vfilter_16 v1, v2
+ vfilter_16 v2, v3
+ vfilter_16 v3, v4
+ vfilter_16 v4, v5
+ vfilter_16 v5, v6
+ vfilter_16 v6, v7
+ vfilter_16 v7, v8
+ vfilter_16 v8, v9
+ vfilter_16 v9, v10
+ vfilter_16 v10, v11
+ vfilter_16 v11, v12
+ vfilter_16 v12, v13
+ vfilter_16 v13, v14
+ vfilter_16 v14, v15
+ vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+ vspltish v18, 0 ;# sum
+ vspltish v19, 0 ;# sse
+ vspltish v23, 0 ;# unpack
+ li r10, 16
+
+ compute_sum_sse_16 v0, 1
+ compute_sum_sse_16 v1, 1
+ compute_sum_sse_16 v2, 1
+ compute_sum_sse_16 v3, 1
+ compute_sum_sse_16 v4, 1
+ compute_sum_sse_16 v5, 1
+ compute_sum_sse_16 v6, 1
+ compute_sum_sse_16 v7, 1
+ compute_sum_sse_16 v8, 1
+ compute_sum_sse_16 v9, 1
+ compute_sum_sse_16 v10, 1
+ compute_sum_sse_16 v11, 1
+ compute_sum_sse_16 v12, 1
+ compute_sum_sse_16 v13, 1
+ compute_sum_sse_16 v14, 1
+ compute_sum_sse_16 v15, 0
+
+ variance_final v18, v19, v23, 8
+
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+ .data
+
+ .align 4
+hfilter_b:
+ .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
+ .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
+ .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
+ .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
+ .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
+ .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
+ .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
+ .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
+
+ .align 4
+vfilter_b:
+ .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+ .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+ .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+ .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+ .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
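
Each vfilter_b entry is 32 bytes, a 16-byte splat of tap t0 followed by a splat of t1, which is why the second-pass code indexes the table with yoffset << 5. Worked example for yoffset = 3: the entry at byte offset 96 splats t0 = 80 and t1 = 48, and each output pixel becomes (80*above + 48*below + 64) >> 7, the taps again summing to 128.
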
+
+ .align 4
+b_hperm_b:
+ .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+
+ .align 4
+b_0123_b:
+ .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+
+ .align 4
+b_4567_b:
+ .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+
+b_hilo_b:
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/libvpx/vp8/common/ppflags.h b/libvpx/vp8/common/ppflags.h
new file mode 100644
index 0000000..665e21f
--- /dev/null
+++ b/libvpx/vp8/common/ppflags.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PPFLAGS_H
+#define __INC_PPFLAGS_H
+enum
+{
+ VP8D_NOFILTERING = 0,
+ VP8D_DEBLOCK = 1<<0,
+ VP8D_DEMACROBLOCK = 1<<1,
+ VP8D_ADDNOISE = 1<<2,
+ VP8D_DEBUG_TXT_FRAME_INFO = 1<<3,
+ VP8D_DEBUG_TXT_MBLK_MODES = 1<<4,
+ VP8D_DEBUG_TXT_DC_DIFF = 1<<5,
+ VP8D_DEBUG_TXT_RATE_INFO = 1<<6,
+ VP8D_DEBUG_DRAW_MV = 1<<7,
+ VP8D_DEBUG_CLR_BLK_MODES = 1<<8,
+ VP8D_DEBUG_CLR_FRM_REF_BLKS = 1<<9,
+ VP8D_MFQE = 1<<10
+};
+
+typedef struct
+{
+ int post_proc_flag;
+ int deblocking_level;
+ int noise_level;
+ int display_ref_frame_flag;
+ int display_mb_modes_flag;
+ int display_b_modes_flag;
+ int display_mv_flag;
+} vp8_ppflags_t;
+
+#endif
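
The post-processing flags are plain bit flags and combine with bitwise OR. A minimal usage sketch (field values illustrative only):

    #include "vp8/common/ppflags.h"

    /* Request deblocking plus noise addition in one post-proc pass. */
    static vp8_ppflags_t make_ppflags(void)
    {
        vp8_ppflags_t flags = {0};
        flags.post_proc_flag   = VP8D_DEBLOCK | VP8D_ADDNOISE;
        flags.deblocking_level = 5; /* illustrative strength */
        flags.noise_level      = 1; /* illustrative strength */
        return flags;
    }
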
diff --git a/libvpx/vp8/common/pragmas.h b/libvpx/vp8/common/pragmas.h
new file mode 100644
index 0000000..99fee5a
--- /dev/null
+++ b/libvpx/vp8/common/pragmas.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+
+
+#ifdef __INTEL_COMPILER
+#pragma warning(disable:997 1011 170)
+#endif
+#ifdef _MSC_VER
+#pragma warning(disable:4799)
+#endif
diff --git a/libvpx/vp8/common/quant_common.c b/libvpx/vp8/common/quant_common.c
new file mode 100644
index 0000000..05f9210
--- /dev/null
+++ b/libvpx/vp8/common/quant_common.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "quant_common.h"
+
+static const int dc_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17,
+ 18, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 91, 93, 95, 96, 98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157,
+};
+
+static const int ac_qlookup[QINDEX_RANGE] =
+{
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76,
+ 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108,
+ 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152,
+ 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209,
+ 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284,
+};
+
+
+int vp8_dc_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp8_dc2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ] * 2;
+ return retval;
+
+}
+int vp8_dc_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = dc_qlookup[ QIndex ];
+
+ if (retval > 132)
+ retval = 132;
+
+ return retval;
+}
+
+int vp8_ac_yquant(int QIndex)
+{
+ int retval;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
+
+int vp8_ac2quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+ * The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+ * word size. */
+ retval = (ac_qlookup[ QIndex ] * 101581) >> 16;
+
+ if (retval < 8)
+ retval = 8;
+
+ return retval;
+}
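
The fixed-point claim in the comment above is cheap to verify exhaustively over the table range. A hypothetical self-check, not part of libvpx:

    #include <assert.h>

    /* For every x in [0..284], integer x*155/100 equals (x*101581) >> 16. */
    static void check_ac2quant_identity(void)
    {
        int x;
        for (x = 0; x <= 284; x++)
            assert(x * 155 / 100 == (x * 101581) >> 16);
    }
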
+int vp8_ac_uv_quant(int QIndex, int Delta)
+{
+ int retval;
+
+ QIndex = QIndex + Delta;
+
+ if (QIndex > 127)
+ QIndex = 127;
+ else if (QIndex < 0)
+ QIndex = 0;
+
+ retval = ac_qlookup[ QIndex ];
+ return retval;
+}
diff --git a/libvpx/vp8/common/quant_common.h b/libvpx/vp8/common/quant_common.h
new file mode 100644
index 0000000..cb64d8e
--- /dev/null
+++ b/libvpx/vp8/common/quant_common.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "blockd.h"
+#include "onyxc_int.h"
+
+extern int vp8_ac_yquant(int QIndex);
+extern int vp8_dc_quant(int QIndex, int Delta);
+extern int vp8_dc2quant(int QIndex, int Delta);
+extern int vp8_ac2quant(int QIndex, int Delta);
+extern int vp8_dc_uv_quant(int QIndex, int Delta);
+extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#endif
diff --git a/libvpx/vp8/common/reconinter.c b/libvpx/vp8/common/reconinter.c
new file mode 100644
index 0000000..3da3bc7
--- /dev/null
+++ b/libvpx/vp8/common/reconinter.c
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "blockd.h"
+#include "reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "onyxc_int.h"
+#endif
+
+void vp8_copy_mem16x16_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+
+ int r;
+
+ for (r = 0; r < 16; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+ dst[8] = src[8];
+ dst[9] = src[9];
+ dst[10] = src[10];
+ dst[11] = src[11];
+ dst[12] = src[12];
+ dst[13] = src[13];
+ dst[14] = src[14];
+ dst[15] = src[15];
+
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
+ ((uint32_t *)dst)[2] = ((uint32_t *)src)[2] ;
+ ((uint32_t *)dst)[3] = ((uint32_t *)src)[3] ;
+
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x8_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 8; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+void vp8_copy_mem8x4_c(
+ unsigned char *src,
+ int src_stride,
+ unsigned char *dst,
+ int dst_stride)
+{
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+#else
+ ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
+ ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+
+ }
+
+}
+
+
+void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *pred_ptr = d->predictor;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ }
+ else
+ {
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ pred_ptr[0] = ptr[0];
+ pred_ptr[1] = ptr[1];
+ pred_ptr[2] = ptr[2];
+ pred_ptr[3] = ptr[3];
+#else
+ *(uint32_t *)pred_ptr = *(uint32_t *)ptr ;
+#endif
+ pred_ptr += pitch;
+ ptr += pre_stride;
+ }
+ }
+}
+
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x8(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ vp8_copy_mem8x8(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride)
+{
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ x->subpixel_predict8x4(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ vp8_copy_mem8x4(ptr, pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, unsigned char *base_pre, int pre_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *ptr;
+ ptr = base_pre + d->offset + (d->bmi.mv.as_mv.row >> 3) * pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = ptr[0];
+ dst[1] = ptr[1];
+ dst[2] = ptr[2];
+ dst[3] = ptr[3];
+#else
+ *(uint32_t *)dst = *(uint32_t *)ptr ;
+#endif
+ dst += dst_stride;
+ ptr += pre_stride;
+ }
+ }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
+{
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = &x->predictor[256];
+ unsigned char *vpred_ptr = &x->predictor[320];
+
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->pre.uv_stride;
+
+ /* calc uv motion vectors */
+ mv_row += 1 | (mv_row >> (sizeof(int) * CHAR_BIT - 1));
+ mv_col += 1 | (mv_col >> (sizeof(int) * CHAR_BIT - 1));
+ mv_row /= 2;
+ mv_col /= 2;
+ mv_row &= x->fullpixel_mask;
+ mv_col &= x->fullpixel_mask;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, pre_stride, upred_ptr, 8);
+ vp8_copy_mem8x8(vptr, pre_stride, vpred_ptr, 8);
+ }
+}
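
The "+= 1 | (x >> 31)" adjustment above halves a motion-vector component with rounding away from zero: it adds +1 to non-negative values and -1 to negative ones before the truncating division. The 4x4 chroma code below applies the same idea with +/-4 before dividing the four-MV sum by 8. A scalar sketch (assumes 32-bit int and an arithmetic right shift):

    static int half_round_away(int v)
    {
        v += 1 | (v >> 31); /* +1 if v >= 0, -1 if v < 0 */
        return v / 2;       /* C division truncates toward zero */
    }
    /* half_round_away(3) == 2, half_round_away(-3) == -2 */
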
+
+/*encoder only*/
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
+{
+ int i, j;
+ int pre_stride = x->pre.uv_stride;
+ unsigned char *base_pre;
+
+ /* build uv mvs */
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.row
+ + x->block[yoffset+1].bmi.mv.as_mv.row
+ + x->block[yoffset+4].bmi.mv.as_mv.row
+ + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->block[yoffset ].bmi.mv.as_mv.col
+ + x->block[yoffset+1].bmi.mv.as_mv.col
+ + x->block[yoffset+4].bmi.mv.as_mv.col
+ + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+ }
+ }
+
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8, base_pre, pre_stride);
+ else
+ {
+ vp8_build_inter_predictors_b(d0, 8, base_pre, pre_stride, x->subpixel_predict);
+ vp8_build_inter_predictors_b(d1, 8, base_pre, pre_stride, x->subpixel_predict);
+ }
+ }
+}
+
+
+/*encoder only*/
+void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride)
+{
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->pre.y_stride;
+
+ ptr_base = x->pre.y_buffer;
+ ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7,
+ dst_y, dst_ystride);
+ }
+ else
+ {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y,
+ dst_ystride);
+ }
+}
+
+static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+ /* If the MV points so far into the UMV border that no visible pixels
+ * are used for reconstruction, the subpel part of the MV can be
+ * discarded and the MV limited to 16 pixels with equivalent results.
+ *
+ * This limit kicks in at 19 pixels for the top and left edges, for
+ * the 16 pixels plus 3 taps right of the central pixel when subpel
+ * filtering. The bottom and right edges use 16 pixels plus 2 pixels
+ * left of the central pixel when filtering.
+ */
+ if (mv->col < (xd->mb_to_left_edge - (19 << 3)))
+ mv->col = xd->mb_to_left_edge - (16 << 3);
+ else if (mv->col > xd->mb_to_right_edge + (18 << 3))
+ mv->col = xd->mb_to_right_edge + (16 << 3);
+
+ if (mv->row < (xd->mb_to_top_edge - (19 << 3)))
+ mv->row = xd->mb_to_top_edge - (16 << 3);
+ else if (mv->row > xd->mb_to_bottom_edge + (18 << 3))
+ mv->row = xd->mb_to_bottom_edge + (16 << 3);
+}
+
+/* A version of the above function for chroma block MVs.*/
+static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd)
+{
+ mv->col = (2*mv->col < (xd->mb_to_left_edge - (19 << 3))) ?
+ (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col;
+ mv->col = (2*mv->col > xd->mb_to_right_edge + (18 << 3)) ?
+ (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col;
+
+ mv->row = (2*mv->row < (xd->mb_to_top_edge - (19 << 3))) ?
+ (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row;
+ mv->row = (2*mv->row > xd->mb_to_bottom_edge + (18 << 3)) ?
+ (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;
+}
+
+void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride)
+{
+ int offset;
+ unsigned char *ptr;
+ unsigned char *uptr, *vptr;
+
+ int_mv _16x16mv;
+
+ unsigned char *ptr_base = x->pre.y_buffer;
+ int pre_stride = x->pre.y_stride;
+
+ _16x16mv.as_int = x->mode_info_context->mbmi.mv.as_int;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&_16x16mv.as_mv, x);
+ }
+
+ ptr = ptr_base + ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+
+ if ( _16x16mv.as_int & 0x00070007)
+ {
+ x->subpixel_predict16x16(ptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_y, dst_ystride);
+ }
+ else
+ {
+ vp8_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
+ }
+
+ /* calc uv motion vectors */
+ _16x16mv.as_mv.row += 1 | (_16x16mv.as_mv.row >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.col += 1 | (_16x16mv.as_mv.col >> (sizeof(int) * CHAR_BIT - 1));
+ _16x16mv.as_mv.row /= 2;
+ _16x16mv.as_mv.col /= 2;
+ _16x16mv.as_mv.row &= x->fullpixel_mask;
+ _16x16mv.as_mv.col &= x->fullpixel_mask;
+
+ pre_stride >>= 1;
+ offset = ( _16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);
+ uptr = x->pre.u_buffer + offset;
+ vptr = x->pre.v_buffer + offset;
+
+ if ( _16x16mv.as_int & 0x00070007)
+ {
+ x->subpixel_predict8x8(uptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_u, dst_uvstride);
+ x->subpixel_predict8x8(vptr, pre_stride, _16x16mv.as_mv.col & 7, _16x16mv.as_mv.row & 7, dst_v, dst_uvstride);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+ vp8_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+ }
+}
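
The test _16x16mv.as_int & 0x00070007 checks both components' subpel bits at once: row and col occupy the two 16-bit halves of as_int, and the low three bits of each half are the eighth-pel fraction, so the mask is nonzero exactly when either component needs subpixel filtering. For example, row = 5 and col = 8 gives 0x0005 and 0x0008 in the two halves; masking leaves 0x0005, so the subpixel path is taken.
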
+
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
+{
+ int i;
+ unsigned char *base_dst = x->dst.y_buffer;
+ unsigned char *base_pre = x->pre.y_buffer;
+
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ BLOCKD *b;
+ int dst_stride = x->dst.y_stride;
+
+ x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+ x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+ x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+ x->block[10].bmi = x->mode_info_context->bmi[10];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&x->block[ 0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[ 2].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[ 8].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
+ }
+
+ b = &x->block[ 0];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[ 2];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[ 8];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ b = &x->block[10];
+ build_inter_predictors4b(x, b, base_dst + b->offset, dst_stride, base_pre, dst_stride);
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.y_stride;
+
+ x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+ x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ {
+ clamp_mv_to_umv_border(&x->block[i+0].bmi.mv.as_mv, x);
+ clamp_mv_to_umv_border(&x->block[i+1].bmi.mv.as_mv, x);
+ }
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+
+ }
+
+ }
+ base_dst = x->dst.u_buffer;
+ base_pre = x->pre.u_buffer;
+ for (i = 16; i < 20; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+
+ base_dst = x->dst.v_buffer;
+ base_pre = x->pre.v_buffer;
+ for (i = 20; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->dst.uv_stride;
+
+ /* Note: uv mvs already clamped in build_4x4uvmvs() */
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride);
+ else
+ {
+ build_inter_predictors_b(d0, base_dst + d0->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, base_dst + d1->offset, dst_stride, base_pre, dst_stride, x->subpixel_predict);
+ }
+ }
+}
+
+static void build_4x4uvmvs(MACROBLOCKD *x)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ int yoffset = i * 8 + j * 2;
+ int uoffset = 16 + i * 2 + j;
+ int voffset = 20 + i * 2 + j;
+
+ int temp;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
+ + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+ temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
+ + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
+
+ temp += 4 + ((temp >> (sizeof(int) * CHAR_BIT - 1)) << 3);
+
+ x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+ if (x->mode_info_context->mbmi.need_to_clamp_mvs)
+ clamp_uvmv_to_umv_border(&x->block[uoffset].bmi.mv.as_mv, x);
+
+ x->block[voffset].bmi.mv.as_int = x->block[uoffset].bmi.mv.as_int;
+ }
+ }
+}
+
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
+{
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+ }
+ else
+ {
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb(xd);
+ }
+}
diff --git a/libvpx/vp8/common/reconinter.h b/libvpx/vp8/common/reconinter.h
new file mode 100644
index 0000000..233c02e
--- /dev/null
+++ b/libvpx/vp8/common/reconinter.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTER_H
+#define __INC_RECONINTER_H
+
+extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
+extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+
+
+extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ int dst_ystride);
+extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
+ unsigned char *base_pre,
+ int pre_stride,
+ vp8_subpix_fn_t sppf);
+
+extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+
+#endif
diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c
new file mode 100644
index 0000000..4067a68
--- /dev/null
+++ b/libvpx/vp8/common/reconintra.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "blockd.h"
+
+void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ unsigned char yleft_col[16];
+ unsigned char ytop_left = yabove_row[-1];
+ int r, c, i;
+
+ for (i = 0; i < 16; i++)
+ {
+ yleft_col[i] = yleft[i* left_stride];
+ }
+
+ /* for Y */
+ switch (x->mode_info_context->mbmi.mode)
+ {
+ case DC_PRED:
+ {
+ int expected_dc;
+ int i;
+ int shift;
+ int average = 0;
+
+
+ if (x->up_available || x->left_available)
+ {
+ if (x->up_available)
+ {
+ for (i = 0; i < 16; i++)
+ {
+ average += yabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+
+ for (i = 0; i < 16; i++)
+ {
+ average += yleft_col[i];
+ }
+
+ }
+
+
+
+ shift = 3 + x->up_available + x->left_available;
+ expected_dc = (average + (1 << (shift - 1))) >> shift;
+ }
+ else
+ {
+ expected_dc = 128;
+ }
+
+ /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+ for (r = 0; r < 16; r++)
+ {
+ vpx_memset(ypred_ptr, expected_dc, 16);
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
+ ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
+ ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
+ ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+ ypred_ptr += y_stride;
+ }
+ }
+ break;
+ case H_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+
+ vpx_memset(ypred_ptr, yleft_col[r], 16);
+ ypred_ptr += y_stride;
+ }
+
+ }
+ break;
+ case TM_PRED:
+ {
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ int pred = yleft_col[r] + yabove_row[ c] - ytop_left;
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ ypred_ptr[c] = pred;
+ }
+
+ ypred_ptr += y_stride;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
+
+void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ unsigned char uleft_col[8];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char vleft_col[8];
+ unsigned char vtop_left = vabove_row[-1];
+
+ int i, j;
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = uleft [i* left_stride];
+ vleft_col[i] = vleft [i* left_stride];
+ }
+
+ switch (x->mode_info_context->mbmi.uv_mode)
+ {
+ case DC_PRED:
+ {
+ int expected_udc;
+ int expected_vdc;
+ int i;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+
+ if (x->up_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2 + x->up_available + x->left_available;
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ /*vpx_memset(upred_ptr,expected_udc,64);*/
+ /*vpx_memset(vpred_ptr,expected_vdc,64);*/
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, expected_udc, 8);
+ vpx_memset(vpred_ptr, expected_vdc, 8);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+ }
+ break;
+ case V_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memcpy(upred_ptr, uabove_row, 8);
+ vpx_memcpy(vpred_ptr, vabove_row, 8);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+
+ }
+ break;
+ case H_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ vpx_memset(upred_ptr, uleft_col[i], 8);
+ vpx_memset(vpred_ptr, vleft_col[i], 8);
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+ }
+
+ break;
+ case TM_PRED:
+ {
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+ upred_ptr[j] = predu;
+ vpred_ptr[j] = predv;
+ }
+
+ upred_ptr += pred_stride;
+ vpred_ptr += pred_stride;
+ }
+
+ }
+ break;
+ case B_PRED:
+ case NEARESTMV:
+ case NEARMV:
+ case ZEROMV:
+ case NEWMV:
+ case SPLITMV:
+ case MB_MODE_COUNT:
+ break;
+ }
+}
diff --git a/libvpx/vp8/common/reconintra4x4.c b/libvpx/vp8/common/reconintra4x4.c
new file mode 100644
index 0000000..7bb8d0a
--- /dev/null
+++ b/libvpx/vp8/common/reconintra4x4.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "blockd.h"
+
+void vp8_intra4x4_predict_c(unsigned char *Above,
+ unsigned char *yleft, int left_stride,
+ B_PREDICTION_MODE b_mode,
+ unsigned char *dst, int dst_stride,
+ unsigned char top_left)
+{
+ int i, r, c;
+
+ unsigned char Left[4];
+ Left[0] = yleft[0];
+ Left[1] = yleft[left_stride];
+ Left[2] = yleft[2 * left_stride];
+ Left[3] = yleft[3 * left_stride];
+
+ switch (b_mode)
+ {
+ case B_DC_PRED:
+ {
+ int expected_dc = 0;
+
+ for (i = 0; i < 4; i++)
+ {
+ expected_dc += Above[i];
+ expected_dc += Left[i];
+ }
+
+ expected_dc = (expected_dc + 4) >> 3;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ dst[c] = expected_dc;
+ }
+
+ dst += dst_stride;
+ }
+ }
+ break;
+ case B_TM_PRED:
+ {
+ /* prediction similar to true_motion prediction */
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int pred = Above[c] - top_left + Left[r];
+
+ if (pred < 0)
+ pred = 0;
+
+ if (pred > 255)
+ pred = 255;
+
+ dst[c] = pred;
+ }
+
+ dst += dst_stride;
+ }
+ }
+ break;
+
+ case B_VE_PRED:
+ {
+
+ unsigned int ap[4];
+ ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2;
+ ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
+ ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
+ ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+
+ dst[c] = ap[c];
+ }
+
+ dst += dst_stride;
+ }
+
+ }
+ break;
+
+
+ case B_HE_PRED:
+ {
+
+ unsigned int lp[4];
+ lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
+ lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
+ lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
+ lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ dst[c] = lp[r];
+ }
+
+ dst += dst_stride;
+ }
+ }
+ break;
+ case B_LD_PRED:
+ {
+ unsigned char *ptr = Above;
+ dst[0 * dst_stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+ dst[0 * dst_stride + 1] =
+ dst[1 * dst_stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+ dst[0 * dst_stride + 2] =
+ dst[1 * dst_stride + 1] =
+ dst[2 * dst_stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+ dst[0 * dst_stride + 3] =
+ dst[1 * dst_stride + 2] =
+ dst[2 * dst_stride + 1] =
+ dst[3 * dst_stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+ dst[1 * dst_stride + 3] =
+ dst[2 * dst_stride + 2] =
+ dst[3 * dst_stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+ dst[2 * dst_stride + 3] =
+ dst[3 * dst_stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+ dst[3 * dst_stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+
+ }
+ break;
+ case B_RD_PRED:
+ {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+ dst[3 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[3 * dst_stride + 1] =
+ dst[2 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[3 * dst_stride + 2] =
+ dst[2 * dst_stride + 1] =
+ dst[1 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[3 * dst_stride + 3] =
+ dst[2 * dst_stride + 2] =
+ dst[1 * dst_stride + 1] =
+ dst[0 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[2 * dst_stride + 3] =
+ dst[1 * dst_stride + 2] =
+ dst[0 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[1 * dst_stride + 3] =
+ dst[0 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ dst[0 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+
+ }
+ break;
+ case B_VR_PRED:
+ {
+
+ unsigned char pp[9];
+
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[3 * dst_stride + 1] =
+ dst[1 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[2 * dst_stride + 1] =
+ dst[0 * dst_stride + 0] = (pp[4] + pp[5] + 1) >> 1;
+ dst[3 * dst_stride + 2] =
+ dst[1 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[2 * dst_stride + 2] =
+ dst[0 * dst_stride + 1] = (pp[5] + pp[6] + 1) >> 1;
+ dst[3 * dst_stride + 3] =
+ dst[1 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ dst[2 * dst_stride + 3] =
+ dst[0 * dst_stride + 2] = (pp[6] + pp[7] + 1) >> 1;
+ dst[1 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ dst[0 * dst_stride + 3] = (pp[7] + pp[8] + 1) >> 1;
+
+ }
+ break;
+ case B_VL_PRED:
+ {
+
+ unsigned char *pp = Above;
+
+ dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ dst[1 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[2 * dst_stride + 0] =
+ dst[0 * dst_stride + 1] = (pp[1] + pp[2] + 1) >> 1;
+ dst[1 * dst_stride + 1] =
+ dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 1] =
+ dst[0 * dst_stride + 2] = (pp[2] + pp[3] + 1) >> 1;
+ dst[3 * dst_stride + 1] =
+ dst[1 * dst_stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[0 * dst_stride + 3] =
+ dst[2 * dst_stride + 2] = (pp[3] + pp[4] + 1) >> 1;
+ dst[1 * dst_stride + 3] =
+ dst[3 * dst_stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[2 * dst_stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[3 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+ case B_HD_PRED:
+ {
+ unsigned char pp[9];
+ pp[0] = Left[3];
+ pp[1] = Left[2];
+ pp[2] = Left[1];
+ pp[3] = Left[0];
+ pp[4] = top_left;
+ pp[5] = Above[0];
+ pp[6] = Above[1];
+ pp[7] = Above[2];
+ pp[8] = Above[3];
+
+
+ dst[3 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ dst[3 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[2 * dst_stride + 0] =
+ dst[3 * dst_stride + 2] = (pp[1] + pp[2] + 1) >> 1;
+ dst[2 * dst_stride + 1] =
+ dst[3 * dst_stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 2] =
+ dst[1 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ dst[2 * dst_stride + 3] =
+ dst[1 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ dst[1 * dst_stride + 2] =
+ dst[0 * dst_stride + 0] = (pp[3] + pp[4] + 1) >> 1;
+ dst[1 * dst_stride + 3] =
+ dst[0 * dst_stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ dst[0 * dst_stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ dst[0 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ }
+ break;
+
+
+ case B_HU_PRED:
+ {
+ unsigned char *pp = Left;
+ dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ dst[0 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ dst[0 * dst_stride + 2] =
+ dst[1 * dst_stride + 0] = (pp[1] + pp[2] + 1) >> 1;
+ dst[0 * dst_stride + 3] =
+ dst[1 * dst_stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ dst[1 * dst_stride + 2] =
+ dst[2 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ dst[1 * dst_stride + 3] =
+ dst[2 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+ dst[2 * dst_stride + 2] =
+ dst[2 * dst_stride + 3] =
+ dst[3 * dst_stride + 0] =
+ dst[3 * dst_stride + 1] =
+ dst[3 * dst_stride + 2] =
+ dst[3 * dst_stride + 3] = pp[3];
+ }
+ break;
+
+ default:
+ break;
+
+ }
+}
diff --git a/libvpx/vp8/common/reconintra4x4.h b/libvpx/vp8/common/reconintra4x4.h
new file mode 100644
index 0000000..d2b0d43
--- /dev/null
+++ b/libvpx/vp8/common/reconintra4x4.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTRA4x4_H
+#define __INC_RECONINTRA4x4_H
+#include "vp8/common/blockd.h"
+
+static void intra_prediction_down_copy(MACROBLOCKD *xd,
+ unsigned char *above_right_src)
+{
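+    /* Duplicate the 4 above-right border pixels of the macroblock down
+     * the right edge (rows 3, 7 and 11) so that the 4x4 intra predictors
+     * of right-column subblocks can read an "above-right" row that
+     * actually exists in the reconstruction buffer. */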
+ int dst_stride = xd->dst.y_stride;
+ unsigned char *above_right_dst = xd->dst.y_buffer - dst_stride + 16;
+
+ unsigned int *src_ptr = (unsigned int *)above_right_src;
+ unsigned int *dst_ptr0 = (unsigned int *)(above_right_dst + 4 * dst_stride);
+ unsigned int *dst_ptr1 = (unsigned int *)(above_right_dst + 8 * dst_stride);
+ unsigned int *dst_ptr2 = (unsigned int *)(above_right_dst + 12 * dst_stride);
+
+ *dst_ptr0 = *src_ptr;
+ *dst_ptr1 = *src_ptr;
+ *dst_ptr2 = *src_ptr;
+}
+
+#endif
diff --git a/libvpx/vp8/common/rtcd.c b/libvpx/vp8/common/rtcd.c
new file mode 100644
index 0000000..01dad46
--- /dev/null
+++ b/libvpx/vp8/common/rtcd.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_config.h"
+#define RTCD_C
+#include "vpx_rtcd.h"
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+static void once(void (*func)(void))
+{
+ static CRITICAL_SECTION *lock;
+ static LONG waiters;
+ static int done;
+ void *lock_ptr = &lock;
+
+ /* If the initialization is complete, return early. This isn't just an
+ * optimization, it prevents races on the destruction of the global
+ * lock.
+ */
+ if(done)
+ return;
+
+ InterlockedIncrement(&waiters);
+
+ /* Get a lock. We create one and try to make it the one-true-lock,
+ * throwing it away if we lost the race.
+ */
+
+ {
+ /* Scope to protect access to new_lock */
+ CRITICAL_SECTION *new_lock = malloc(sizeof(CRITICAL_SECTION));
+ InitializeCriticalSection(new_lock);
+ if (InterlockedCompareExchangePointer(lock_ptr, new_lock, NULL) != NULL)
+ {
+ DeleteCriticalSection(new_lock);
+ free(new_lock);
+ }
+ }
+
+ /* At this point, we have a lock that can be synchronized on. We don't
+ * care which thread actually performed the allocation.
+ */
+
+ EnterCriticalSection(lock);
+
+ if (!done)
+ {
+ func();
+ done = 1;
+ }
+
+ LeaveCriticalSection(lock);
+
+ /* Last one out should free resources. The destructed objects are
+ * protected by checking if(done) above.
+ */
+ if(!InterlockedDecrement(&waiters))
+ {
+ DeleteCriticalSection(lock);
+ free(lock);
+ lock = NULL;
+ }
+}
+
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+ static pthread_once_t lock = PTHREAD_ONCE_INIT;
+ pthread_once(&lock, func);
+}
+
+
+#else
+/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+ static int done;
+
+ if(!done)
+ {
+ func();
+ done = 1;
+ }
+}
+#endif
+
+
+void vpx_rtcd()
+{
+ once(setup_rtcd_internal);
+}
diff --git a/libvpx/vp8/common/rtcd_defs.sh b/libvpx/vp8/common/rtcd_defs.sh
new file mode 100644
index 0000000..0f950f8
--- /dev/null
+++ b/libvpx/vp8/common/rtcd_defs.sh
@@ -0,0 +1,568 @@
+common_forward_decls() {
+cat <<EOF
+#include "vp8/common/blockd.h"
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls common_forward_decls
+
+#
+# Dequant
+#
+prototype void vp8_dequantize_b "struct blockd*, short *dqc"
+specialize vp8_dequantize_b mmx media neon
+vp8_dequantize_b_media=vp8_dequantize_b_v6
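
Each prototype/specialize pair in this file drives the RTCD generator:
prototype declares a function signature, specialize lists the optimized
variants to consider, and the plain name=symbol assignments override the
default symbol naming (here, mapping the generic "media" slot onto the ARMv6
routine). A minimal, self-contained sketch of the run-time dispatch this
expands to; all names below are illustrative, not the generator's actual
output.

    #include <stdio.h>

    /* Generic C fallback and an optimized variant with the same signature. */
    static int sum_c(const int *v, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++) s += v[i];
        return s;
    }
    static int sum_simd(const int *v, int n)
    {
        int i, s = 0;
        for (i = 0; i < n; i++) s += v[i];
        return s;
    }

    /* The dispatch pointer a generated header would declare. */
    static int (*sum)(const int *v, int n);

    /* What setup_rtcd_internal() does: pick the best variant for this CPU. */
    static void setup_dispatch(int cpu_has_simd)
    {
        sum = sum_c;                /* always-safe default */
        if (cpu_has_simd)
            sum = sum_simd;         /* runtime override    */
    }

    int main(void)
    {
        const int v[4] = { 1, 2, 3, 4 };
        setup_dispatch(1);
        printf("%d\n", sum(v, 4)); /* 10, via the selected variant */
        return 0;
    }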
+
+prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
+specialize vp8_dequant_idct_add mmx media neon dspr2
+vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6
+vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2
+
+prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
+specialize vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2
+vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6
+vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
+
+prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
+specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2
+vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6
+vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2
+
+#
+# Loopfilter
+#
+prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_mbv mmx sse2 media neon dspr2
+vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6
+vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2
+
+prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_bv mmx sse2 media neon dspr2
+vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6
+vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2
+
+prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_mbh mmx sse2 media neon dspr2
+vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6
+vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2
+
+prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp8_loop_filter_bh mmx sse2 media neon dspr2
+vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6
+vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2
+
+
+prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
+vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c
+vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx
+vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2
+vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6
+vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon
+
+prototype void vp8_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_mbh mmx sse2 media neon
+vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c
+vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx
+vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2
+vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6
+vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon
+
+prototype void vp8_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_bv mmx sse2 media neon
+vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c
+vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx
+vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2
+vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6
+vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon
+
+prototype void vp8_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+specialize vp8_loop_filter_simple_bh mmx sse2 media neon
+vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c
+vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx
+vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2
+vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6
+vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon
+
+#
+# IDCT
+#
+#idct16
+prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"
+specialize vp8_short_idct4x4llm mmx media neon dspr2
+vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual
+vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2
+
+#iwalsh1
+prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output"
+specialize vp8_short_inv_walsh4x4_1 dspr2
+vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2
+# no asm yet
+
+#iwalsh16
+prototype void vp8_short_inv_walsh4x4 "short *input, short *output"
+specialize vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2
+vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6
+vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2
+
+#idct1_scalar_add
+prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"
+specialize vp8_dc_only_idct_add mmx media neon dspr2
+vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6
+vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2
+
+#
+# RECON
+#
+prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem16x16 mmx sse2 media neon dspr2
+vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6
+vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2
+
+prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem8x8 mmx media neon dspr2
+vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6
+vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2
+
+prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+specialize vp8_copy_mem8x4 mmx media neon dspr2
+vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
+vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2
+
+prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
+specialize vp8_build_intra_predictors_mby_s sse2 ssse3
+#TODO: fix assembly for neon
+
+prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
+specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+
+prototype void vp8_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
+specialize vp8_intra4x4_predict media
+vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6
+
+#
+# Postproc
+#
+if [ "$CONFIG_POSTPROC" = "yes" ]; then
+ prototype void vp8_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols,int flimit"
+ specialize vp8_mbpost_proc_down mmx sse2
+ vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm
+
+ prototype void vp8_mbpost_proc_across_ip "unsigned char *dst, int pitch, int rows, int cols,int flimit"
+ specialize vp8_mbpost_proc_across_ip sse2
+ vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
+
+ prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
+ specialize vp8_post_proc_down_and_across_mb_row sse2
+
+ prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
+ specialize vp8_plane_add_noise mmx sse2
+ vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt
+
+ prototype void vp8_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+ # no asm yet
+
+ prototype void vp8_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+ # no asm yet
+
+ prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+ # no asm yet
+
+ prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ specialize vp8_filter_by_weight16x16 sse2
+
+ prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ specialize vp8_filter_by_weight8x8 sse2
+
+ prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ # no asm yet
+fi
+
+#
+# Subpixel
+#
+prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2
+vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6
+vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2
+
+prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2
+vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6
+vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2
+
+prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2
+vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6
+vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2
+
+prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2
+vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6
+vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2
+
+prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon
+vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6
+
+prototype void vp8_bilinear_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon
+vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6
+
+prototype void vp8_bilinear_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict8x4 mmx media neon
+vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6
+
+prototype void vp8_bilinear_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
+specialize vp8_bilinear_predict4x4 mmx media neon
+vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6
+
+#
+# Whole-pixel Variance
+#
+prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance4x4 mmx sse2
+vp8_variance4x4_sse2=vp8_variance4x4_wmt
+
+prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance8x8 mmx sse2 media neon
+vp8_variance8x8_sse2=vp8_variance8x8_wmt
+vp8_variance8x8_media=vp8_variance8x8_armv6
+
+prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance8x16 mmx sse2 neon
+vp8_variance8x16_sse2=vp8_variance8x16_wmt
+
+prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance16x8 mmx sse2 neon
+vp8_variance16x8_sse2=vp8_variance16x8_wmt
+
+prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance16x16 mmx sse2 media neon
+vp8_variance16x16_sse2=vp8_variance16x16_wmt
+vp8_variance16x16_media=vp8_variance16x16_armv6
+
+#
+# Sub-pixel Variance
+#
+prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance4x4 mmx sse2
+vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt
+
+prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x8 mmx sse2 media neon
+vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt
+vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6
+
+prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance8x16 mmx sse2
+vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x8 mmx sse2 ssse3
+vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt
+
+prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon
+vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt
+vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_h mmx sse2 media neon
+vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt
+vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_v mmx sse2 media neon
+vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt
+vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6
+
+prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_variance_halfpixvar16x16_hv mmx sse2 media neon
+vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt
+vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6
+
+#
+# Single block SAD
+#
+prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad4x4 mmx sse2 neon
+vp8_sad4x4_sse2=vp8_sad4x4_wmt
+
+prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad8x8 mmx sse2 neon
+vp8_sad8x8_sse2=vp8_sad8x8_wmt
+
+prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad8x16 mmx sse2 neon
+vp8_sad8x16_sse2=vp8_sad8x16_wmt
+
+prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad16x8 mmx sse2 neon
+vp8_sad16x8_sse2=vp8_sad16x8_wmt
+
+prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
+specialize vp8_sad16x16 mmx sse2 sse3 media neon
+vp8_sad16x16_sse2=vp8_sad16x16_wmt
+vp8_sad16x16_media=vp8_sad16x16_armv6
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x3 sse3
+
+prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x3 sse3
+
+prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x3 sse3
+
+prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x3 sse3 ssse3
+
+prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x3 sse3 ssse3
+
+# Note the only difference in the following prototypes is that they return
+# into an array of unsigned short
+prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad4x4x8 sse4_1
+vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4
+
+prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x8x8 sse4_1
+vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4
+
+prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad8x16x8 sse4_1
+vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4
+
+prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x8x8 sse4_1
+vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4
+
+prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"
+specialize vp8_sad16x16x8 sse4_1
+vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad4x4x4d sse3
+
+prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x8x4d sse3
+
+prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad8x16x4d sse3
+
+prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x8x4d sse3
+
+prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"
+specialize vp8_sad16x16x4d sse3
+
+#
+# Encoder functions below this point.
+#
+if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
+
+#
+# Sum of squares (vector)
+#
+prototype unsigned int vp8_get_mb_ss "const short *"
+specialize vp8_get_mb_ss mmx sse2
+
+#
+# SSE (Sum Squared Error)
+#
+prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp8_sub_pixel_mse16x16 mmx sse2
+vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt
+
+prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp8_mse16x16 mmx sse2 media neon
+vp8_mse16x16_sse2=vp8_mse16x16_wmt
+vp8_mse16x16_media=vp8_mse16x16_armv6
+
+prototype unsigned int vp8_get4x4sse_cs "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"
+specialize vp8_get4x4sse_cs mmx neon
+
+#
+# Block copy
+#
+case $arch in
+ x86*)
+ prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+ specialize vp8_copy32xn sse2 sse3
+ ;;
+esac
+
+#
+# Structured Similarity (SSIM)
+#
+if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
+ [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
+
+ prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+ specialize vp8_ssim_parms_8x8 $sse2_on_x86_64
+
+ prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+ specialize vp8_ssim_parms_16x16 $sse2_on_x86_64
+fi
+
+#
+# Forward DCT
+#
+prototype void vp8_short_fdct4x4 "short *input, short *output, int pitch"
+specialize vp8_short_fdct4x4 mmx sse2 media neon
+vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6
+
+prototype void vp8_short_fdct8x4 "short *input, short *output, int pitch"
+specialize vp8_short_fdct8x4 mmx sse2 media neon
+vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6
+
+prototype void vp8_short_walsh4x4 "short *input, short *output, int pitch"
+specialize vp8_short_walsh4x4 sse2 media neon
+vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
+
+#
+# Quantizer
+#
+prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
+specialize vp8_regular_quantize_b sse2 sse4_1
+vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
+
+prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
+specialize vp8_fast_quantize_b sse2 ssse3 media neon
+vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6
+
+prototype void vp8_regular_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
+# no asm yet
+
+prototype void vp8_fast_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
+specialize vp8_fast_quantize_b_pair neon
+
+prototype void vp8_quantize_mb "struct macroblock *"
+specialize vp8_quantize_mb neon
+
+prototype void vp8_quantize_mby "struct macroblock *"
+specialize vp8_quantize_mby neon
+
+prototype void vp8_quantize_mbuv "struct macroblock *"
+specialize vp8_quantize_mbuv neon
+
+#
+# Block subtraction
+#
+prototype int vp8_block_error "short *coeff, short *dqcoeff"
+specialize vp8_block_error mmx sse2
+vp8_block_error_sse2=vp8_block_error_xmm
+
+prototype int vp8_mbblock_error "struct macroblock *mb, int dc"
+specialize vp8_mbblock_error mmx sse2
+vp8_mbblock_error_sse2=vp8_mbblock_error_xmm
+
+prototype int vp8_mbuverror "struct macroblock *mb"
+specialize vp8_mbuverror mmx sse2
+vp8_mbuverror_sse2=vp8_mbuverror_xmm
+
+prototype void vp8_subtract_b "struct block *be, struct blockd *bd, int pitch"
+specialize vp8_subtract_b mmx sse2 media neon
+vp8_subtract_b_media=vp8_subtract_b_armv6
+
+prototype void vp8_subtract_mby "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"
+specialize vp8_subtract_mby mmx sse2 media neon
+vp8_subtract_mby_media=vp8_subtract_mby_armv6
+
+prototype void vp8_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"
+specialize vp8_subtract_mbuv mmx sse2 media neon
+vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6
+
+#
+# Motion search
+#
+prototype int vp8_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_full_search_sad sse3 sse4_1
+vp8_full_search_sad_sse3=vp8_full_search_sadx3
+vp8_full_search_sad_sse4_1=vp8_full_search_sadx8
+
+prototype int vp8_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_refining_search_sad sse3
+vp8_refining_search_sad_sse3=vp8_refining_search_sadx4
+
+prototype int vp8_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
+specialize vp8_diamond_search_sad sse3
+vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if [ "$CONFIG_REALTIME_ONLY" != "yes" ]; then
+ prototype void vp8_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
+ specialize vp8_temporal_filter_apply sse2
+fi
+
+#
+# Pick Loopfilter
+#
+prototype void vp8_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_partial_frame neon
+
+#
+# Denoiser filter
+#
+if [ "$CONFIG_TEMPORAL_DENOISING" = "yes" ]; then
+ prototype int vp8_denoiser_filter "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset"
+ specialize vp8_denoiser_filter sse2
+fi
+
+# End of encoder only functions
+fi
+
+# Scaler functions
+if [ "CONFIG_SPATIAL_RESAMPLING" != "yes" ]; then
+ prototype void vp8_horizontal_line_4_5_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_4_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_4_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_2_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_2_3_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_2_3_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_3_5_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_3_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_3_5_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_3_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_3_4_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_3_4_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_1_2_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_1_2_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_last_vertical_band_1_2_scale "unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_5_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_5_4_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_5_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_5_3_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_horizontal_line_2_1_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
+ prototype void vp8_vertical_band_2_1_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+ prototype void vp8_vertical_band_2_1_scale_i "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
+fi
+
+prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf"
+specialize vp8_yv12_extend_frame_borders neon
+
+prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_frame neon
+
+prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
+specialize vp8_yv12_copy_y neon
+
diff --git a/libvpx/vp8/common/sad_c.c b/libvpx/vp8/common/sad_c.c
new file mode 100644
index 0000000..5f36fc9
--- /dev/null
+++ b/libvpx/vp8/common/sad_c.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad, int m, int n)
+{
+ int r, c;
+ unsigned int sad = 0;
+
+ for (r = 0; r < n; r++)
+ {
+ for (c = 0; c < m; c++)
+ {
+ sad += abs(src_ptr[c] - ref_ptr[c]);
+ }
+
+ if (sad > max_sad)
+ break;
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+
+ return sad;
+}
+
+/* max_sad is provided as an optional optimization point. Alternative
+ * implementations of these functions are not required to check it.
+ */
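
To make the early-exit contract concrete: callers typically pass the best SAD
seen so far as max_sad, so clearly-losing candidates abort after a partial row
sum. A self-contained sketch; the block size, strides, and test data are
illustrative.

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    static unsigned int sad_4x4(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int max_sad)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
                sad += abs(src[c] - ref[c]);
            if (sad > max_sad)
                break;               /* cannot beat the current best */
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }

    int main(void)
    {
        unsigned char src[4 * 8], ref[4 * 8];
        unsigned int best = UINT_MAX;
        int i, off, best_off = 0;

        for (i = 0; i < 4 * 8; i++)
        {
            src[i] = (unsigned char)i;
            ref[i] = (unsigned char)(i + 2);
        }

        for (off = 0; off <= 4; off++)   /* candidates one pixel apart */
        {
            unsigned int s = sad_4x4(src, 8, ref + off, 8, best);
            if (s < best) { best = s; best_off = off; }
        }
        printf("best offset %d, sad %u\n", best_off, best);  /* 0, 32 */
        return 0;
    }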
+
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
+}
+
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
+}
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
+}
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
+}
+
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int max_sad)
+{
+ return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
+}
+
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+}
+
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char *ref_ptr, int ref_stride,
+ unsigned short *sad_array)
+{
+ sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+ sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+ sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+ sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+ sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+ sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+ sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+ sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
+ const unsigned char * const ref_ptr[], int ref_stride,
+ unsigned int *sad_array)
+{
+ sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+ sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+ sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+ sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
+}
+
+/* Copy a strip two macroblocks (32 pixels) wide and 'height' rows tall to a buffer */
+void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
+ unsigned char *dst_ptr, int dst_stride,
+ int height)
+{
+ int r;
+
+ for (r = 0; r < height; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ dst_ptr[4] = src_ptr[4];
+ dst_ptr[5] = src_ptr[5];
+ dst_ptr[6] = src_ptr[6];
+ dst_ptr[7] = src_ptr[7];
+ dst_ptr[8] = src_ptr[8];
+ dst_ptr[9] = src_ptr[9];
+ dst_ptr[10] = src_ptr[10];
+ dst_ptr[11] = src_ptr[11];
+ dst_ptr[12] = src_ptr[12];
+ dst_ptr[13] = src_ptr[13];
+ dst_ptr[14] = src_ptr[14];
+ dst_ptr[15] = src_ptr[15];
+ dst_ptr[16] = src_ptr[16];
+ dst_ptr[17] = src_ptr[17];
+ dst_ptr[18] = src_ptr[18];
+ dst_ptr[19] = src_ptr[19];
+ dst_ptr[20] = src_ptr[20];
+ dst_ptr[21] = src_ptr[21];
+ dst_ptr[22] = src_ptr[22];
+ dst_ptr[23] = src_ptr[23];
+ dst_ptr[24] = src_ptr[24];
+ dst_ptr[25] = src_ptr[25];
+ dst_ptr[26] = src_ptr[26];
+ dst_ptr[27] = src_ptr[27];
+ dst_ptr[28] = src_ptr[28];
+ dst_ptr[29] = src_ptr[29];
+ dst_ptr[30] = src_ptr[30];
+ dst_ptr[31] = src_ptr[31];
+#else
+ ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];
+ ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];
+ ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];
+ ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];
+ ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];
+ ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];
+ ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];
+ ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];
+#endif
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ }
+}
diff --git a/libvpx/vp8/common/setupintrarecon.c b/libvpx/vp8/common/setupintrarecon.c
new file mode 100644
index 0000000..60afe51
--- /dev/null
+++ b/libvpx/vp8/common/setupintrarecon.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "setupintrarecon.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+
+ /* set up new frame for intra coded blocks */
+ vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ for (i = 0; i < ybf->y_height; i++)
+ ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+ vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ for (i = 0; i < ybf->uv_height; i++)
+ ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
+
+}
+
+void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
+{
+ vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+}
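
The constants above follow the VP8 convention for intra prediction at frame
edges: the row above a new frame is primed with 127 and the column to its left
with 129. A minimal, self-contained sketch of that border layout on a mocked
plane; the sizes and 1-pixel border are illustrative (the real code extends
the top row width+5 bytes to cover above-right prediction).

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        enum { W = 4, H = 4, STRIDE = W + 2 };
        unsigned char plane[(H + 1) * STRIDE];
        unsigned char *y = plane + STRIDE + 1;   /* (0,0) of the visible plane */
        int r;

        memset(plane, 0, sizeof(plane));
        memset(y - 1 - STRIDE, 127, W + 2);      /* row above, incl. corners */
        for (r = 0; r < H; r++)
            y[r * STRIDE - 1] = 129;             /* column to the left */

        printf("above-left corner: %d, left of row 2: %d\n",
               y[-1 - STRIDE], y[2 * STRIDE - 1]);  /* prints 127, 129 */
        return 0;
    }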
diff --git a/libvpx/vp8/common/setupintrarecon.h b/libvpx/vp8/common/setupintrarecon.h
new file mode 100644
index 0000000..e515c3a
--- /dev/null
+++ b/libvpx/vp8/common/setupintrarecon.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
+static
+void setup_intra_recon_left(unsigned char *y_buffer,
+ unsigned char *u_buffer,
+ unsigned char *v_buffer,
+ int y_stride,
+ int uv_stride)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+ y_buffer[y_stride *i] = (unsigned char) 129;
+
+ for (i = 0; i < 8; i++)
+ u_buffer[uv_stride *i] = (unsigned char) 129;
+
+ for (i = 0; i < 8; i++)
+ v_buffer[uv_stride *i] = (unsigned char) 129;
+}
+
+#endif
diff --git a/libvpx/vp8/common/swapyv12buffer.c b/libvpx/vp8/common/swapyv12buffer.c
new file mode 100644
index 0000000..73656b3
--- /dev/null
+++ b/libvpx/vp8/common/swapyv12buffer.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "swapyv12buffer.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
+{
+ unsigned char *temp;
+
+ temp = last_frame->buffer_alloc;
+ last_frame->buffer_alloc = new_frame->buffer_alloc;
+ new_frame->buffer_alloc = temp;
+
+ temp = last_frame->y_buffer;
+ last_frame->y_buffer = new_frame->y_buffer;
+ new_frame->y_buffer = temp;
+
+ temp = last_frame->u_buffer;
+ last_frame->u_buffer = new_frame->u_buffer;
+ new_frame->u_buffer = temp;
+
+ temp = last_frame->v_buffer;
+ last_frame->v_buffer = new_frame->v_buffer;
+ new_frame->v_buffer = temp;
+
+}
diff --git a/libvpx/vp8/common/swapyv12buffer.h b/libvpx/vp8/common/swapyv12buffer.h
new file mode 100644
index 0000000..a6473ed
--- /dev/null
+++ b/libvpx/vp8/common/swapyv12buffer.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SWAPYV12_BUFFER_H
+#define SWAPYV12_BUFFER_H
+
+#include "vpx_scale/yv12config.h"
+
+void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
+
+#endif
diff --git a/libvpx/vp8/common/systemdependent.h b/libvpx/vp8/common/systemdependent.h
new file mode 100644
index 0000000..f99c4bb
--- /dev/null
+++ b/libvpx/vp8/common/systemdependent.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp8_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp8_clear_system_state()
+#endif
+
+struct VP8Common;
+void vp8_machine_specific_config(struct VP8Common *);
diff --git a/libvpx/vp8/common/textblit.c b/libvpx/vp8/common/textblit.c
new file mode 100644
index 0000000..1756100
--- /dev/null
+++ b/libvpx/vp8/common/textblit.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+
+void vp8_blit_text(const char *msg, unsigned char *address, const int pitch)
+{
+ int letter_bitmap;
+ unsigned char *output_pos = address;
+ int colpos;
+ const int font[] =
+ {
+ 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
+ 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
+ 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
+ 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
+ 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
+ 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
+ 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
+ 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
+ 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
+ };
+ colpos = 0;
+
+ while (msg[colpos] != 0)
+ {
+ char letter = msg[colpos];
+ int fontcol, fontrow;
+
+ if (letter <= 'Z' && letter >= ' ')
+ letter_bitmap = font[letter-' '];
+ else if (letter <= 'z' && letter >= 'a')
+ letter_bitmap = font[letter-'a'+'A' - ' '];
+ else
+ letter_bitmap = font[0];
+
+ for (fontcol = 6; fontcol >= 0 ; fontcol--)
+ for (fontrow = 0; fontrow < 5; fontrow++)
+ output_pos[fontrow *pitch + fontcol] =
+ ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
+
+ output_pos += 7;
+ colpos++;
+ }
+}
+
+static void plot (const int x, const int y, unsigned char *image, const int pitch)
+{
+ image [x+y*pitch] ^= 255;
+}
+
+/* Bresenham line algorithm */
+void vp8_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch)
+{
+ int steep = abs(y1 - y0) > abs(x1 - x0);
+ int deltax, deltay;
+ int error, ystep, y, x;
+
+ if (steep)
+ {
+ int t;
+ t = x0;
+ x0 = y0;
+ y0 = t;
+
+ t = x1;
+ x1 = y1;
+ y1 = t;
+ }
+
+ if (x0 > x1)
+ {
+ int t;
+ t = x0;
+ x0 = x1;
+ x1 = t;
+
+ t = y0;
+ y0 = y1;
+ y1 = t;
+ }
+
+ deltax = x1 - x0;
+ deltay = abs(y1 - y0);
+ error = deltax / 2;
+
+ y = y0;
+
+ if (y0 < y1)
+ ystep = 1;
+ else
+ ystep = -1;
+
+ if (steep)
+ {
+ for (x = x0; x <= x1; x++)
+ {
+ plot(y,x, image, pitch);
+
+ error = error - deltay;
+ if (error < 0)
+ {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+ else
+ {
+ for (x = x0; x <= x1; x++)
+ {
+ plot(x,y, image, pitch);
+
+ error = error - deltay;
+ if (error < 0)
+ {
+ y = y + ystep;
+ error = error + deltax;
+ }
+ }
+ }
+}
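
A hedged usage sketch for the two blit helpers: stamping a text label and a
diagonal marker into an 8-bit luma plane for debug overlays. Compile it
together with textblit.c; the extern prototypes below are assumed from this
file, and the buffer geometry is illustrative.

    #include <stdio.h>

    void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
    void vp8_blit_line(int x0, int x1, int y0, int y1,
                       unsigned char *image, const int pitch);

    int main(void)
    {
        static unsigned char luma[64 * 64];      /* zero-initialized plane */

        vp8_blit_text("VP8", luma + 8 * 64 + 8, 64);  /* glyphs use 7x5 cells */
        vp8_blit_line(0, 63, 0, 63, luma, 64);        /* diagonal marker      */

        printf("center pixel: %d\n", luma[32 * 64 + 32]);  /* 255: on the line */
        return 0;
    }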
diff --git a/libvpx/vp8/common/threading.h b/libvpx/vp8/common/threading.h
new file mode 100644
index 0000000..ed9e3e6
--- /dev/null
+++ b/libvpx/vp8/common/threading.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef _PTHREAD_EMULATION
+#define _PTHREAD_EMULATION
+
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
+
+/* Thread management macros */
+#ifdef _WIN32
+/* Win32 */
+#include <process.h>
+#include <windows.h>
+#define THREAD_FUNCTION DWORD WINAPI
+#define THREAD_FUNCTION_RETURN DWORD
+#define THREAD_SPECIFIC_INDEX DWORD
+#define pthread_t HANDLE
+#define pthread_attr_t DWORD
+#define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL)
+#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread))
+#define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread)
+#define thread_sleep(nms) Sleep(nms)
+#define pthread_cancel(thread) terminate_thread(thread,0)
+#define ts_key_create(ts_key, destructor) {ts_key = TlsAlloc();};
+#define pthread_getspecific(ts_key) TlsGetValue(ts_key)
+#define pthread_setspecific(ts_key, value) TlsSetValue(ts_key, (void *)value)
+#define pthread_self() GetCurrentThreadId()
+
+#elif defined(__OS2__)
+/* OS/2 */
+#define INCL_DOS
+#include <os2.h>
+
+#include <stdlib.h>
+#define THREAD_FUNCTION void
+#define THREAD_FUNCTION_RETURN void
+#define THREAD_SPECIFIC_INDEX PULONG
+#define pthread_t TID
+#define pthread_attr_t ULONG
+#define pthread_create(thhandle,attr,thfunc,tharg) \
+ ((int)((*(thhandle)=_beginthread(thfunc,NULL,1024*1024,tharg))==-1))
+#define pthread_join(thread, result) ((int)DosWaitThread(&(thread),0))
+#define pthread_detach(thread) 0
+#define thread_sleep(nms) DosSleep(nms)
+#define pthread_cancel(thread) DosKillThread(thread)
+#define ts_key_create(ts_key, destructor) \
+ DosAllocThreadLocalMemory(1, &(ts_key));
+#define pthread_getspecific(ts_key) ((void *)(*(ts_key)))
+#define pthread_setspecific(ts_key, value) (*(ts_key)=(ULONG)(value))
+#define pthread_self() _gettid()
+#else
+#ifdef __APPLE__
+#include <mach/mach_init.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <time.h>
+#include <unistd.h>
+
+#else
+#include <semaphore.h>
+#endif
+
+#include <pthread.h>
+/* pthreads */
+/* Nearly everything is already defined */
+#define THREAD_FUNCTION void *
+#define THREAD_FUNCTION_RETURN void *
+#define THREAD_SPECIFIC_INDEX pthread_key_t
+#define ts_key_create(ts_key, destructor) pthread_key_create (&(ts_key), destructor);
+#endif
+
+/* Synchronization macros: Win32 and Pthreads */
+#ifdef _WIN32
+#define sem_t HANDLE
+#define pause(voidpara) __asm PAUSE
+#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL)
+#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE))
+#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL)
+#define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE)
+#define thread_sleep(nms) Sleep(nms)
+
+#elif defined(__OS2__)
+typedef struct
+{
+ HEV event;
+ HMTX wait_mutex;
+ HMTX count_mutex;
+ int count;
+} sem_t;
+
+static inline int sem_init(sem_t *sem, int pshared, unsigned int value)
+{
+ DosCreateEventSem(NULL, &sem->event, pshared ? DC_SEM_SHARED : 0,
+ value > 0 ? TRUE : FALSE);
+ DosCreateMutexSem(NULL, &sem->wait_mutex, 0, FALSE);
+ DosCreateMutexSem(NULL, &sem->count_mutex, 0, FALSE);
+
+ sem->count = value;
+
+ return 0;
+}
+
+static inline int sem_wait(sem_t * sem)
+{
+ DosRequestMutexSem(sem->wait_mutex, -1);
+
+ DosWaitEventSem(sem->event, -1);
+
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ sem->count--;
+ if (sem->count == 0)
+ {
+ ULONG post_count;
+
+ DosResetEventSem(sem->event, &post_count);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ DosReleaseMutexSem(sem->wait_mutex);
+
+ return 0;
+}
+
+static inline int sem_post(sem_t * sem)
+{
+ DosRequestMutexSem(sem->count_mutex, -1);
+
+ if (sem->count < 32768)
+ {
+ sem->count++;
+ DosPostEventSem(sem->event);
+ }
+
+ DosReleaseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+static inline int sem_destroy(sem_t * sem)
+{
+ DosCloseEventSem(sem->event);
+ DosCloseMutexSem(sem->wait_mutex);
+ DosCloseMutexSem(sem->count_mutex);
+
+ return 0;
+}
+
+#define thread_sleep(nms) DosSleep(nms)
+
+#else
+
+#ifdef __APPLE__
+#define sem_t semaphore_t
+#define sem_init(X,Y,Z) semaphore_create(mach_task_self(), X, SYNC_POLICY_FIFO, Z)
+#define sem_wait(sem) (semaphore_wait(*sem) )
+#define sem_post(sem) semaphore_signal(*sem)
+#define sem_destroy(sem) semaphore_destroy(mach_task_self(),*sem)
+#define thread_sleep(nms) /* { struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#else
+#include <unistd.h>
+#include <sched.h>
+#define thread_sleep(nms) sched_yield();/* {struct timespec ts;ts.tv_sec=0; ts.tv_nsec = 1000*nms;nanosleep(&ts, NULL);} */
+#endif
+/* Not Windows. Assume pthreads */
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+#include "vpx_ports/x86.h"
+#else
+#define x86_pause_hint()
+#endif
+
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
+
+#endif
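
Taken together, these macros let the decoder's threading code be written once against a pthread-style surface and compile against the Win32, OS/2 or pthread primitives selected above. A minimal consumer sketch (hypothetical names, shown as it would expand under the pthread backend; the include path is assumed):

    #include "vp8/common/threading.h"

    static sem_t work_ready;

    static THREAD_FUNCTION worker(void *arg)
    {
        (void)arg;
        sem_wait(&work_ready);              /* block until the producer posts */
        /* ... process one unit of work ... */
        return (THREAD_FUNCTION_RETURN)0;
    }

    static int start_worker(pthread_t *tid)
    {
        sem_init(&work_ready, 0, 0);
        /* Expands to _beginthreadex, _beginthread or the real pthread_create,
         * depending on which platform block above was selected. */
        return pthread_create(tid, NULL, worker, NULL);
    }
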
diff --git a/libvpx/vp8/common/treecoder.c b/libvpx/vp8/common/treecoder.c
new file mode 100644
index 0000000..d80c64b
--- /dev/null
+++ b/libvpx/vp8/common/treecoder.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+#include <stdio.h>
+
+#include "treecoder.h"
+
+/* Recursively walk the tree, accumulating the codeword bits in v and the
+ * code length in L, and record both at each leaf.  A leaf is an index
+ * <= 0, whose negation is the token value. */
+static void tree2tok(
+    struct vp8_token_struct *const p,
+    vp8_tree t,
+    int i,
+    int v,
+    int L
+)
+{
+    v += v;        /* append a 0 bit for the first branch */
+    ++L;
+
+    do
+    {
+        const vp8_tree_index j = t[i++];
+
+        if (j <= 0)
+        {
+            p[-j].value = v;
+            p[-j].Len = L;
+        }
+        else
+            tree2tok(p, t, j, v, L);
+    }
+    while (++v & 1);   /* second iteration takes the 1 branch of this node */
+}
+
+void vp8_tokens_from_tree(struct vp8_token_struct *p, vp8_tree t)
+{
+ tree2tok(p, t, 0, 0, 0);
+}
+
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *p, vp8_tree t,
+ int offset)
+{
+ tree2tok(p - offset, t, 0, 0, 0);
+}
+
+static void branch_counts(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ]
+)
+{
+ const int tree_len = n - 1;
+ int t = 0;
+
+#if CONFIG_DEBUG
+ assert(tree_len);
+#endif
+
+ do
+ {
+ branch_ct[t][0] = branch_ct[t][1] = 0;
+ }
+ while (++t < tree_len);
+
+ t = 0;
+
+ do
+ {
+ int L = tok[t].Len;
+ const int enc = tok[t].value;
+ const unsigned int ct = num_events[t];
+
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (enc >> --L) & 1;
+ const int j = i >> 1;
+#if CONFIG_DEBUG
+ assert(j < tree_len && 0 <= L);
+#endif
+
+ branch_ct [j] [b] += ct;
+ i = tree[ i + b];
+ }
+ while (i > 0);
+
+#if CONFIG_DEBUG
+ assert(!L);
+#endif
+ }
+ while (++t < n);
+
+}
+
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfac,
+ int rd
+)
+{
+ const int tree_len = n - 1;
+ int t = 0;
+
+ branch_counts(n, tok, tree, branch_ct, num_events);
+
+ do
+ {
+ const unsigned int *const c = branch_ct[t];
+ const unsigned int tot = c[0] + c[1];
+
+#if CONFIG_DEBUG
+ assert(tot < (1 << 24)); /* no overflow below */
+#endif
+
+ if (tot)
+ {
+ const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
+ probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
+ }
+ else
+ probs[t] = vp8_prob_half;
+ }
+ while (++t < tree_len);
+}
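
As a hedged numeric check of the loop above: with branch counts c = {30, 10}, Pfac = 256 and rd = 1, tot = 40 and p = (30*256 + 40/2) / 40 = 192, i.e. the probability of taking branch 0 (30/40) scaled to the 8 bit range. The per-branch step restated as a standalone helper (a sketch, not part of the source; vp8_prob_half is 128 per treecoder.h):

    static vp8_prob prob_from_counts(unsigned int c0, unsigned int c1,
                                     unsigned int Pfac, int rd)
    {
        const unsigned int tot = c0 + c1;
        unsigned int p;

        if (!tot)
            return vp8_prob_half;            /* no observations: even odds */

        p = (c0 * Pfac + (rd ? tot >> 1 : 0)) / tot;
        return p < 256 ? (p ? p : 1) : 255;  /* clamp to [1, 255] */
    }
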
diff --git a/libvpx/vp8/common/treecoder.h b/libvpx/vp8/common/treecoder.h
new file mode 100644
index 0000000..ebf51c5
--- /dev/null
+++ b/libvpx/vp8/common/treecoder.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp8bc_index_t; /* probability index */
+
+
+typedef unsigned char vp8_prob;
+
+#define vp8_prob_half ( (vp8_prob) 128)
+
+typedef signed char vp8_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+# define vp8_complement( x) (255 - x)
+
+
+/* We build coding trees compactly in arrays.
+ Each node of the tree is a pair of vp8_tree_indices.
+ Array index often references a corresponding probability table.
+ Index <= 0 means done encoding/decoding and value = -Index,
+ Index > 0 means need another bit, specification at index.
+ Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp8_tree_index vp8_tree[], *vp8_tree_p;
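
To make the layout concrete, here is a sketch for a hypothetical three-symbol alphabet (the enum and table are illustrative, not data from the codec): node 0 occupies indices 0 and 1, and its second branch points at the node stored at index 2.

    enum { SYM_A, SYM_B, SYM_C };

    static const vp8_tree_index my_tree[4] =
    {
        -SYM_A, 2,            /* bit 0 -> A (leaf); bit 1 -> node at index 2 */
        -SYM_B, -SYM_C        /* bit 0 -> B;        bit 1 -> C */
    };

Feeding this to vp8_tokens_from_tree() (declared below) would yield the codewords A = 0 (Len 1), B = 10 (Len 2) and C = 11 (Len 2).
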
+
+
+typedef const struct vp8_token_struct
+{
+ int value;
+ int Len;
+} vp8_token;
+
+/* Construct encoding array from tree. */
+
+void vp8_tokens_from_tree(struct vp8_token_struct *, vp8_tree);
+void vp8_tokens_from_tree_offset(struct vp8_token_struct *, vp8_tree,
+ int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+ for the associated binary encoding tree. Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+ probability updates. */
+
+void vp8_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ unsigned int Pfactor,
+ int Round
+);
+
+/* Variant of above using coder spec rather than hardwired 8-bit probs. */
+
+void vp8bc_tree_probs_from_distribution(
+ int n, /* n = size of alphabet */
+ vp8_token tok [ /* n */ ],
+ vp8_tree tree,
+ vp8_prob probs [ /* n-1 */ ],
+ unsigned int branch_ct [ /* n-1 */ ] [2],
+ const unsigned int num_events[ /* n */ ],
+ c_bool_coder_spec *s
+);
+
+
+#endif
diff --git a/libvpx/vp8/common/variance.h b/libvpx/vp8/common/variance.h
new file mode 100644
index 0000000..01193b8
--- /dev/null
+++ b/libvpx/vp8/common/variance.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+#include "vpx_config.h"
+
+typedef unsigned int(*vp8_sad_fn_t)(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int max_sad);
+
+typedef void (*vp8_copy32xn_fn_t)(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ int n);
+
+typedef void (*vp8_sad_multi_fn_t)(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sad_array);
+
+typedef void (*vp8_sad_multi1_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned short *sad_array
+ );
+
+typedef void (*vp8_sad_multi_d_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char * const ref_ptr[],
+ int ref_stride,
+ unsigned int *sad_array
+ );
+
+typedef unsigned int (*vp8_variance_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride,
+ unsigned int *sse
+ );
+
+typedef unsigned int (*vp8_subpixvariance_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ int xoffset,
+ int yoffset,
+ const unsigned char *ref_ptr,
+ int Refstride,
+ unsigned int *sse
+ );
+
+typedef void (*vp8_ssimpf_fn_t)
+ (
+ unsigned char *s,
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s,
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+ );
+
+typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
+
+typedef unsigned int (*vp8_get16x16prederror_fn_t)
+ (
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int ref_stride
+ );
+
+typedef struct variance_vtable
+{
+ vp8_sad_fn_t sdf;
+ vp8_variance_fn_t vf;
+ vp8_subpixvariance_fn_t svf;
+ vp8_variance_fn_t svf_halfpix_h;
+ vp8_variance_fn_t svf_halfpix_v;
+ vp8_variance_fn_t svf_halfpix_hv;
+ vp8_sad_multi_fn_t sdx3f;
+ vp8_sad_multi1_fn_t sdx8f;
+ vp8_sad_multi_d_fn_t sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+ vp8_copy32xn_fn_t copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+
+#endif
diff --git a/libvpx/vp8/common/variance_c.c b/libvpx/vp8/common/variance_c.c
new file mode 100644
index 0000000..da08aff
--- /dev/null
+++ b/libvpx/vp8/common/variance_c.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "filter.h"
+
+
+unsigned int vp8_get_mb_ss_c
+(
+ const short *src_ptr
+)
+{
+ unsigned int i = 0, sum = 0;
+
+ do
+ {
+ sum += (src_ptr[i] * src_ptr[i]);
+ i++;
+ }
+ while (i < 256);
+
+ return sum;
+}
+
+
+static void variance(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ int w,
+ int h,
+ unsigned int *sse,
+ int *sum)
+{
+ int i, j;
+ int diff;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++)
+ {
+ for (j = 0; j < w; j++)
+ {
+ diff = src_ptr[j] - ref_ptr[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+}
+
+
+unsigned int vp8_variance16x16_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 8));
+}
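
Despite its name, the local avg holds the raw sum of the differences; the return value applies the one-pass identity sum((d - mean)^2) = sum(d^2) - (sum d)^2 / N, with N = 256 for a 16x16 block (hence the >> 8, and >> 7, >> 6, >> 4 for the smaller blocks below). A toy restatement of the 16x16 case (a sketch, assuming d[] already holds the 256 per-pixel differences):

    static unsigned int block_variance_16x16(const int d[256])
    {
        int i, sum = 0;
        unsigned int sse = 0;

        for (i = 0; i < 256; i++)
        {
            sum += d[i];
            sse += (unsigned int)(d[i] * d[i]);
        }

        return sse - ((unsigned int)(sum * sum) >> 8);  /* sse - sum^2 / 256 */
    }
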
+
+unsigned int vp8_variance8x16_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+}
+
+unsigned int vp8_variance16x8_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+}
+
+
+unsigned int vp8_variance8x8_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 6));
+}
+
+unsigned int vp8_variance4x4_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4));
+}
+
+
+unsigned int vp8_mse16x16_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+ *sse = var;
+ return var;
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_first_pass
+ *
+ * INPUTS : UINT8 *src_ptr : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ *                  INT16  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement first-pass
+ * of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass
+(
+ const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply bilinear filter */
+ output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr              : Pointer to source block.
+ * UINT32 src_pixels_per_line : Stride of input block.
+ * UINT32 pixel_step : Offset between filter input samples (see notes).
+ * UINT32 output_height : Input block height.
+ * UINT32 output_width : Input block width.
+ *                  INT16  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8 *output_ptr         : Pointer to filtered block.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in
+ * either horizontal or vertical direction to produce the
+ * filtered output block. Used to implement second-pass
+ * of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by var_filter_block2d_bil_first_pass.
+ * Two filter taps should sum to VP8_FILTER_WEIGHT.
+ * pixel_step defines whether the filter is applied
+ * horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ * It defines the offset required to move from one input
+ * to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass
+(
+ const unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+)
+{
+ unsigned int i, j;
+ int Temp;
+
+ for (i = 0; i < output_height; i++)
+ {
+ for (j = 0; j < output_width; j++)
+ {
+ /* Apply filter */
+ Temp = ((int)src_ptr[0] * vp8_filter[0]) +
+ ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+ (VP8_FILTER_WEIGHT / 2);
+            output_ptr[j] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);
+ src_ptr++;
+ }
+
+ /* Next row... */
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+
+unsigned int vp8_sub_pixel_variance4x4_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+    unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+    /* First filter 1-D horizontally */
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+    /* Now filter vertically */
+ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);
+
+ return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+ return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[17*16]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+ return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
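
The three wrappers above pass offset 4, which selects the {64, 64} row of vp8_bilinear_filters (compare the x86 copies of that table later in this change). With VP8_FILTER_WEIGHT at 128 and VP8_FILTER_SHIFT at 7, each half-pixel tap pair therefore reduces to a rounded average, a sketch of which is:

    /* (64*a + 64*b + 64) >> 7  ==  (a + b + 1) >> 1 */
    static unsigned char half_pel_sample(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }
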
+
+
+unsigned int vp8_sub_pixel_mse16x16_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[16*9]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+ return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance8x16_c
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+    unsigned short FData3[9*16]; /* Temp data buffer used in filtering */
+ unsigned char temp2[20*16];
+ const short *HFilter, *VFilter;
+
+
+ HFilter = vp8_bilinear_filters[xoffset];
+ VFilter = vp8_bilinear_filters[yoffset];
+
+
+ var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
+ var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+ return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
diff --git a/libvpx/vp8/common/vp8_entropymodedata.h b/libvpx/vp8/common/vp8_entropymodedata.h
new file mode 100644
index 0000000..13e9a92
--- /dev/null
+++ b/libvpx/vp8/common/vp8_entropymodedata.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/*Generated file, included by entropymode.c*/
+
+
+const struct vp8_token_struct vp8_bmode_encodings[VP8_BINTRAMODES] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 28, 5 },
+ { 30, 5 },
+ { 58, 6 },
+ { 59, 6 },
+ { 62, 6 },
+ { 126, 7 },
+ { 127, 7 }
+};
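
Each entry above is a vp8_token_struct { value, Len } pair: the codeword is the low Len bits of value, emitted most significant bit first, so { 28, 5 } denotes the bit string 11100. A tiny illustrative helper for reading the tables (not part of the source):

    #include <stdio.h>

    static void print_token(int value, int len)
    {
        int i;

        for (i = len - 1; i >= 0; i--)
            putchar('0' + ((value >> i) & 1));
        putchar('\n');                 /* print_token(28, 5) prints 11100 */
    }
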
+
+const struct vp8_token_struct vp8_ymode_encodings[VP8_YMODES] =
+{
+ { 0, 1 },
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_kf_ymode_encodings[VP8_YMODES] =
+{
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 },
+ { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_uv_mode_encodings[VP8_UV_MODES] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_mbsplit_encodings[VP8_NUMMBSPLITS] =
+{
+ { 6, 3 },
+ { 7, 3 },
+ { 2, 2 },
+ { 0, 1 }
+};
+
+const struct vp8_token_struct vp8_mv_ref_encoding_array[VP8_MVREFS] =
+{
+ { 2, 2 },
+ { 6, 3 },
+ { 0, 1 },
+ { 14, 4 },
+ { 15, 4 }
+};
+
+const struct vp8_token_struct vp8_sub_mv_ref_encoding_array[VP8_SUBMVREFS] =
+{
+ { 0, 1 },
+ { 2, 2 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const struct vp8_token_struct vp8_small_mvencodings[8] =
+{
+ { 0, 3 },
+ { 1, 3 },
+ { 2, 3 },
+ { 3, 3 },
+ { 4, 3 },
+ { 5, 3 },
+ { 6, 3 },
+ { 7, 3 }
+};
+
+const vp8_prob vp8_ymode_prob[VP8_YMODES-1] =
+{
+ 112, 86, 140, 37
+};
+
+const vp8_prob vp8_kf_ymode_prob[VP8_YMODES-1] =
+{
+ 145, 156, 163, 128
+};
+
+const vp8_prob vp8_uv_mode_prob[VP8_UV_MODES-1] =
+{
+ 162, 101, 204
+};
+
+const vp8_prob vp8_kf_uv_mode_prob[VP8_UV_MODES-1] =
+{
+ 142, 114, 183
+};
+
+const vp8_prob vp8_bmode_prob[VP8_BINTRAMODES-1] =
+{
+ 120, 90, 79, 133, 87, 85, 80, 111, 151
+};
+
+
+
+const vp8_prob vp8_kf_bmode_prob
+[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1] =
+{
+ {
+ { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+ { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+ { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+ { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+ { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+ { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+ { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+ { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+ { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+ { 81, 40, 11, 96, 182, 84, 29, 16, 36 }
+ },
+ {
+ { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+ { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+ { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+ { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+ { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+ { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+ { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+ { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+ { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+ { 66, 45, 25, 102, 197, 189, 23, 18, 22 }
+ },
+ {
+ { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+ { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+ { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+ { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+ { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+ { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+ { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+ { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+ { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+ { 62, 18, 78, 95, 85, 57, 50, 48, 51 }
+ },
+ {
+ { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+ { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+ { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+ { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+ { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+ { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+ { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+ { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+ { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+ { 51, 50, 17, 168, 209, 192, 23, 25, 82 }
+ },
+ {
+ { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+ { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+ { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+ { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+ { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+ { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+ { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+ { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+ { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+ { 117, 20, 15, 36, 163, 128, 68, 1, 26 }
+ },
+ {
+ { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+ { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+ { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+ { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+ { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+ { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+ { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+ { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+ { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+ { 87, 37, 9, 115, 59, 77, 64, 21, 47 }
+ },
+ {
+ { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+ { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+ { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+ { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+ { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+ { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+ { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+ { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+ { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+ { 55, 38, 70, 124, 73, 102, 1, 34, 98 }
+ },
+ {
+ { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+ { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+ { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+ { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+ { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+ { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+ { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+ { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+ { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+ { 58, 15, 20, 82, 135, 57, 26, 121, 40 }
+ },
+ {
+ { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+ { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+ { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+ { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+ { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+ { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+ { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+ { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+ { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+ { 35, 27, 10, 146, 174, 171, 12, 26, 128 }
+ },
+ {
+ { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+ { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+ { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+ { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+ { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+ { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+ { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+ { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+ { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+ { 112, 19, 12, 61, 195, 128, 48, 4, 24 }
+ }
+};
diff --git a/libvpx/vp8/common/x86/dequantize_mmx.asm b/libvpx/vp8/common/x86/dequantize_mmx.asm
new file mode 100644
index 0000000..4e551f0
--- /dev/null
+++ b/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -0,0 +1,258 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+sym(vp8_dequantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;sq
+ mov rdi, arg(1) ;dq
+ mov rax, arg(2) ;q
+
+ movq mm1, [rsi]
+ pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers.
+ movq [rdi], mm1
+
+ movq mm1, [rsi+8]
+ pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+8], mm1
+
+ movq mm1, [rsi+16]
+ pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+16], mm1
+
+ movq mm1, [rsi+24]
+ pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers.
+ movq [rdi+24], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_dequant_idct_add_mmx(
+;short *input, 0
+;short *dq, 1
+;unsigned char *dest, 2
+;int stride) 3
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
+sym(vp8_dequant_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rdx, arg(1) ;dq
+
+
+ movq mm0, [rax ]
+ pmullw mm0, [rdx]
+
+ movq mm1, [rax +8]
+ pmullw mm1, [rdx +8]
+
+ movq mm2, [rax+16]
+ pmullw mm2, [rdx+16]
+
+ movq mm3, [rax+24]
+ pmullw mm3, [rdx+24]
+
+ mov rdx, arg(2) ;dest
+
+ pxor mm7, mm7
+
+
+ movq [rax], mm7
+ movq [rax+8], mm7
+
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+
+
+ movsxd rdi, dword ptr arg(3) ;stride
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
+
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7
+
+ movd mm4, [rdx]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rdx+rdi]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rdx+2*rdi]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+
+ movd mm4, [rdx+2*rdi]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/libvpx/vp8/common/x86/filter_x86.c b/libvpx/vp8/common/x86/filter_x86.c
new file mode 100644
index 0000000..ebab814
--- /dev/null
+++ b/libvpx/vp8/common/x86/filter_x86.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
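
The rows mirror vp8/common/filter.c with each tap duplicated across SIMD lanes so MMX/SSE code can load per-lane constants directly; in every phase the paired taps still sum to VP8_FILTER_WEIGHT (128). A hedged self-check, assuming filter_x86.h and assert.h are available:

    #include <assert.h>
    #include "filter_x86.h"

    static void check_bilinear_tables(void)
    {
        int i, j;

        for (i = 0; i < 8; i++)
        {
            for (j = 0; j < 4; j++)
                assert(vp8_bilinear_filters_x86_4[i][j] +
                       vp8_bilinear_filters_x86_4[i][j + 4] == 128);

            for (j = 0; j < 8; j++)
                assert(vp8_bilinear_filters_x86_8[i][j] +
                       vp8_bilinear_filters_x86_8[i][j + 8] == 128);
        }
    }
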
diff --git a/libvpx/vp8/common/x86/filter_x86.h b/libvpx/vp8/common/x86/filter_x86.h
new file mode 100644
index 0000000..efcc4dc
--- /dev/null
+++ b/libvpx/vp8/common/x86/filter_x86.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_X86_H
+#define FILTER_X86_H
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+extern const short vp8_bilinear_filters_x86_4[8][8]; /* duplicated 4x */
+extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+#endif /* FILTER_X86_H */
diff --git a/libvpx/vp8/common/x86/idct_blk_mmx.c b/libvpx/vp8/common/x86/idct_blk_mmx.c
new file mode 100644
index 0000000..4adf3f5
--- /dev/null
+++ b/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC)
+{
+ short *sq = (short *) d->qcoeff;
+ short *dq = (short *) d->dqcoeff;
+
+ vp8_dequantize_b_impl_mmx(sq, dq, DQC);
+}
+
+void vp8_dequant_idct_add_y_block_mmx
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dst, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
+ dst+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ if (eobs[2] > 1)
+ vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
+ dst+8, stride);
+ ((int *)(q+32))[0] = 0;
+ }
+
+ if (eobs[3] > 1)
+ vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
+ dst+12, stride);
+ ((int *)(q+48))[0] = 0;
+ }
+
+ q += 64;
+ dst += 4*stride;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_mmx
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstu += 4*stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++)
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstv += 4*stride;
+ eobs += 2;
+ }
+}
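
In the eob == 1 branches above only the dequantized DC term q[0]*dq[0] survives, so the 4x4 inverse transform collapses to adding one rounded constant to all 16 pixels, and the ((int *)q)[0] = 0 stores clear the two 16 bit coefficients the shortcut left behind in a single write. A C sketch of what the vp8_dc_only_idct_add path computes (prediction and destination coincide here, as in the calls above):

    static void dc_only_idct_add_sketch(short dc, unsigned char *dst, int stride)
    {
        const int v = (dc + 4) >> 3;      /* same rounding as the full IDCT */
        int r, c;

        for (r = 0; r < 4; r++, dst += stride)
            for (c = 0; c < 4; c++)
            {
                const int t = dst[c] + v;
                dst[c] = (unsigned char)(t < 0 ? 0 : t > 255 ? 255 : t);
            }
    }
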
diff --git a/libvpx/vp8/common/x86/idct_blk_sse2.c b/libvpx/vp8/common/x86/idct_blk_sse2.c
new file mode 100644
index 0000000..056e052
--- /dev/null
+++ b/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+void vp8_idct_dequant_full_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+
+void vp8_dequant_idct_add_y_block_sse2
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+ }
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+ }
+ q += 64;
+ dst += stride*4;
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_sse2
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+ dstu += stride*4;
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)(eobs))[2] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+ q += 32;
+ dstv += stride*4;
+
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)(eobs))[3] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+}
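
The casts above read the two per-block eobs of a pair as one 16 bit word; masking with 0xfefe clears bit 0 of each byte, so the test is nonzero exactly when at least one of the two 4x4 blocks has eob > 1 (some AC coefficient present), which routes the pair to the full IDCT rather than the DC-only path. The predicate restated as a sketch:

    /* Nonzero iff either 4x4 block of the pair needs the full IDCT. */
    static int pair_has_ac(const char *eobs)
    {
        return (((const short *)eobs)[0] & 0xfefe) != 0;
    }
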
diff --git a/libvpx/vp8/common/x86/idctllm_mmx.asm b/libvpx/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 0000000..96fa2c6
--- /dev/null
+++ b/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * The second constant is trickier: its 16 bit version is 35468, which
+; * is bigger than 32768, so in a signed 16 bit multiply it becomes a
+; * negative number.
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
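
A minimal C restatement of the two tricks in the note above (illustrative only; it assumes a 32 bit int and an arithmetic right shift, matching pmulhw's signed high-half multiply):

    #define X_C1SQR2LESS1 20091   /* (sqrt(2)*cos(pi/8) - 1) * 65536 = 0x4E7B */
    #define X_S1SQR2      35468   /* sqrt(2)*sin(pi/8) * 65536      = 0x8A8C */

    static short mul_c1sqr2(short x)          /* x * sqrt(2) * cos(pi/8) */
    {
        return (short)(x + ((x * X_C1SQR2LESS1) >> 16));
    }

    static short mul_s1sqr2(short x)          /* x * sqrt(2) * sin(pi/8) */
    {
        /* 35468 read as a signed 16 bit value is -30068, hence the "+ x". */
        return (short)(((x * (short)X_S1SQR2) >> 16) + x);
    }
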
+
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride)
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+sym(vp8_short_idct4x4llm_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rsi, arg(1) ;pred
+
+ movq mm0, [rax ]
+ movq mm1, [rax+ 8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+
+%if 0
+ pxor mm7, mm7
+ movq [rax], mm7
+ movq [rax+8], mm7
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+%endif
+ movsxd rax, dword ptr arg(2) ;pitch
+ mov rdx, arg(3) ;dest
+ movsxd rdi, dword ptr arg(4) ;stride
+
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
+
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7
+
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_dc_only_idct_add_mmx(
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+sym(vp8_dc_only_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ movd mm5, arg(0) ;input_dc
+ mov rax, arg(1) ;pred_ptr
+ movsxd rdx, dword ptr arg(2) ;pred_stride
+
+ pxor mm0, mm0
+
+ paddw mm5, [GLOBAL(fours)]
+ lea rcx, [rdx + rdx*2]
+
+ psraw mm5, 3
+
+ punpcklwd mm5, mm5
+
+ punpckldq mm5, mm5
+
+ movd mm1, [rax]
+ movd mm2, [rax+rdx]
+ movd mm3, [rax+2*rdx]
+ movd mm4, [rax+rcx]
+
+ mov rax, arg(3) ;d -- destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ punpcklbw mm1, mm0
+ paddsw mm1, mm5
+ packuswb mm1, mm0 ; pack and unpack to saturate
+ lea rcx, [rdx + rdx*2]
+
+ punpcklbw mm2, mm0
+ paddsw mm2, mm5
+ packuswb mm2, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm3, mm0
+ paddsw mm3, mm5
+ packuswb mm3, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm4, mm0
+ paddsw mm4, mm5
+ packuswb mm4, mm0 ; pack and unpack to saturate
+
+ movd [rax], mm1
+ movd [rax+rdx], mm2
+ movd [rax+2*rdx], mm3
+ movd [rax+rcx], mm4
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B
+align 16
+fours:
+ times 4 dw 0x0004
diff --git a/libvpx/vp8/common/x86/idctllm_sse2.asm b/libvpx/vp8/common/x86/idctllm_sse2.asm
new file mode 100644
index 0000000..bf8e2c4
--- /dev/null
+++ b/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,708 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ ; end prolog
+
+ mov rdx, arg(1) ; dequant
+ mov rax, arg(0) ; qcoeff
+
+ movd xmm4, [rax]
+ movd xmm5, [rdx]
+
+ pinsrw xmm4, [rax+32], 4
+ pinsrw xmm5, [rdx], 4
+
+ pmullw xmm4, xmm5
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; clear coeffs
+ movd [rax], xmm5
+ movd [rax+32], xmm5
+;pshufb
+ mov rax, arg(2) ; dst
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ pshuflw xmm4, xmm4, 00000000b
+ pshufhw xmm4, xmm4, 00000000b
+
+ lea rcx, [rdx + rdx*2]
+ paddw xmm4, [GLOBAL(fours)]
+
+ psraw xmm4, 3
+
+ movq xmm0, [rax]
+ movq xmm1, [rax+rdx]
+ movq xmm2, [rax+2*rdx]
+ movq xmm3, [rax+rcx]
+
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rax], xmm0
+ movq [rax + rdx], xmm1
+
+ lea rax, [rax + 2*rdx]
+
+ movq [rax], xmm2
+ movq [rax + rdx], xmm3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+ mov rdi, arg(2) ; dst
+
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+    ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ lea rcx, [rdx + rdx*2] ;dst_stride * 3
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+2*rdx]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff
+
+ mov rdi, arg(2) ; dst
+ mov rdx, arg(4) ; dc
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; load up 2 dc words here == 2*16 = doubleword
+ movd xmm4, [rdx]
+
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ lea rcx, [rdx + rdx*2]
+ ; Load up predict blocks
+ movq xmm0, [rdi]
+ movq xmm1, [rdi+rdx*1]
+ movq xmm2, [rdi+rdx*2]
+ movq xmm3, [rdi+rcx]
+
+ ; Duplicate and expand dc across
+ punpcklwd xmm4, xmm4
+ punpckldq xmm4, xmm4
+
+ ; Rounding to dequant and downshift
+ paddw xmm4, [GLOBAL(fours)]
+ psraw xmm4, 3
+
+ ; Predict buffer needs to be expanded from bytes to words
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
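+
+; For reference, a minimal C sketch of the dc_0 path above (illustrative
+; only; clamp_u8 stands for the packuswb saturation, and the two 4x4
+; blocks sit side by side in dst):
+;   int dc0 = (dc[0] + 4) >> 3, dc1 = (dc[1] + 4) >> 3;
+;   for (int r = 0; r < 4; r++)
+;       for (int c = 0; c < 4; c++) {
+;           dst[r * stride + c]     = clamp_u8(dst[r * stride + c]     + dc0);
+;           dst[r * stride + 4 + c] = clamp_u8(dst[r * stride + 4 + c] + dc1);
+;       }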
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; full case: both blocks carry more than a dc coefficient, so load and
+ ; dequantize all of qcoeff; the dc values are patched in from arg(4) below
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+
+ mov rdi, arg(2) ; dst
+
+ ; Zero out xmm7, for use in unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for the shuffle
+ ; to produce sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; DC component
+ mov rdx, arg(4)
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; insert DC component
+ pinsrw xmm0, [rdx], 0
+ pinsrw xmm0, [rdx+2], 4
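+
+ ; The two pinsrw above patch the dc words from arg(4) into coefficient 0
+ ; of each block's first row, replacing the value just dequantized there
+ ; (the dc is presumably produced by the caller's inverse WHT; that code
+ ; is not part of this hunk).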
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)]
+
+ paddw xmm2, [GLOBAL(fours)]
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7
+
+ ; Load up predict blocks
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+ lea rcx, [rdx + rdx*2]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+rdx*2]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+fours:
+ times 8 dw 0x0004
+align 16
+x_s1sqr2:
+ times 8 dw 0x8A8C
+align 16
+x_c1sqr2less1:
+ times 8 dw 0x4E7B
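+
+; Note on the constants above, as a sketch of the fixed-point math:
+; x_s1sqr2 = 0x8A8C = 35468 ~ sqrt(2) * sin(pi/8) * 2^16 and
+; x_c1sqr2less1 = 0x4E7B = 20091 ~ (sqrt(2) * cos(pi/8) - 1) * 2^16.
+; pmulhw keeps the high 16 bits of a signed product (and 0x8A8C is
+; negative as an int16), so together with the paddw of the original
+; value that follows each pmulhw:
+;   ((x * 35468) >> 16) + x  ==  x * 0.5412  ==  x * sin(pi/8) * sqrt(2)
+;   ((x * 20091) >> 16) + x  ==  x * 1.3066  ==  x * cos(pi/8) * sqrt(2)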
diff --git a/libvpx/vp8/common/x86/iwalsh_mmx.asm b/libvpx/vp8/common/x86/iwalsh_mmx.asm
new file mode 100644
index 0000000..4aac094
--- /dev/null
+++ b/libvpx/vp8/common/x86/iwalsh_mmx.asm
@@ -0,0 +1,140 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
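+;
+; A minimal C sketch of the 4x4 inverse Walsh-Hadamard butterfly computed
+; below (one pass shown; the second pass repeats it on the transposed data
+; and adds the (x + 3) >> 3 rounding supplied by the 0x0003 words):
+;   a1 = ip[0] + ip[12];   b1 = ip[4] + ip[8];
+;   c1 = ip[4] - ip[8];    d1 = ip[0] - ip[12];
+;   op[0] = a1 + b1;   op[4]  = c1 + d1;
+;   op[8] = a1 - b1;   op[12] = d1 - c1;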
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
+sym(vp8_short_inv_walsh4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+
+ mov rdx, arg(0)
+ mov rax, 30003h
+
+ movq mm0, [rdx + 0] ;ip[0]
+ movq mm1, [rdx + 8] ;ip[4]
+ movd mm7, rax
+
+ movq mm2, [rdx + 16] ;ip[8]
+ movq mm3, [rdx + 24] ;ip[12]
+ punpcklwd mm7, mm7 ;0003000300030003h
+ mov rdx, arg(1)
+
+ movq mm4, mm0
+ movq mm5, mm1
+
+ paddw mm4, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm4 ;temp al
+ paddw mm4, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm1 ;dl + cl
+ psubw mm5, mm1 ;dl - cl
+
+ ; 03 02 01 00
+ ; 13 12 11 10
+ ; 23 22 21 20
+ ; 33 32 31 30
+
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
+
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~
+ movq mm1, mm0
+ movq mm5, mm4
+ paddw mm1, mm3 ;ip[0] + ip[12] aka al
+ paddw mm5, mm2 ;ip[4] + ip[8] aka bl
+
+ movq mm6, mm1 ;temp al
+ paddw mm1, mm5 ;al + bl
+ psubw mm6, mm5 ;al - bl
+ paddw mm1, mm7
+ paddw mm6, mm7
+ psraw mm1, 3
+ psraw mm6, 3
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp dl
+ paddw mm0, mm4 ;dl + cl
+ psubw mm5, mm4 ;dl - cl
+ paddw mm0, mm7
+ paddw mm5, mm7
+ psraw mm0, 3
+ psraw mm5, 3
+;~~~~~~~~~~~~~~~~~~~~~
+
+ movd eax, mm1
+ movd ecx, mm0
+ psrlq mm0, 32
+ psrlq mm1, 32
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*1], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*5], cx
+ movd eax, mm1
+ movd ecx, mm0
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*9], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*13], cx
+
+ movd eax, mm6
+ movd ecx, mm5
+ psrlq mm5, 32
+ psrlq mm6, 32
+ mov word ptr[rdx+32*2], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*6], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, mm6
+ movd ecx, mm5
+ mov word ptr[rdx+32*10], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*14], ax
+ mov word ptr[rdx+32*15], cx
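+
+ ; The word stores above land 32 bytes (16 shorts) apart, which appears
+ ; to scatter each inverse-WHT result into the dc slot of 16 consecutive
+ ; 4x4 coefficient blocks (inferred from the addressing; the caller's
+ ; buffer layout is not shown in this hunk).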
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
diff --git a/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libvpx/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 0000000..06e86a8
--- /dev/null
+++ b/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,121 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
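+;
+; Same butterfly as the MMX version above, but two rows per xmm register:
+; pshufd ..., 4eh swaps the 64-bit halves, so e.g.
+;   xmm0 = [ip4..7 | ip0..3],  xmm2 = [ip8..11 | ip12..15]
+; and one paddw/psubw pair produces (b1 a1) and (c1 d1) for four columns
+; at once.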
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+sym(vp8_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+
+ mov rcx, arg(0)
+ mov rdx, arg(1)
+ mov rax, 30003h
+
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
+
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ movd xmm0, eax
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0
+ paddw xmm4, xmm0
+ psraw xmm5, 3
+ psraw xmm4, 3
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/loopfilter_block_sse2.asm b/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
new file mode 100644
index 0000000..1c445ef
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
@@ -0,0 +1,813 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+ ; %1 value not preserved
+ ; %2 value preserved
+ ; output in %1
+ movdqa scratch1, %2 ; v2
+
+ psubusb scratch1, %1 ; v2 - v1
+ psubusb %1, %2 ; v1 - v2
+ por %1, scratch1 ; abs(v2 - v1)
+%endmacro
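+
+; A scalar sketch of the trick: with unsigned saturating subtracts exactly
+; one of the two differences survives, so OR-ing them yields the absolute
+; difference per byte:
+;   t  = sat_sub_u8(v2, v1);        /* v2 > v1 ? v2 - v1 : 0 */
+;   v1 = sat_sub_u8(v1, v2) | t;    /* = |v1 - v2| */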
+
+%macro LF_FILTER_HEV_MASK 8-9
+
+ LF_ABS %1, %2 ; abs(p3 - p2)
+ LF_ABS %2, %3 ; abs(p2 - p1)
+ pmaxub %1, %2 ; accumulate mask
+%if %0 == 8
+ movdqa scratch2, %3 ; save p1
+ LF_ABS scratch2, %4 ; abs(p1 - p0)
+%endif
+ LF_ABS %4, %5 ; abs(p0 - q0)
+ LF_ABS %5, %6 ; abs(q0 - q1)
+%if %0 == 8
+ pmaxub %5, scratch2 ; accumulate hev
+%else
+ pmaxub %5, %9
+%endif
+ pmaxub %1, %5 ; accumulate mask
+
+ LF_ABS %3, %6 ; abs(p1 - q1)
+ LF_ABS %6, %7 ; abs(q1 - q2)
+ pmaxub %1, %6 ; accumulate mask
+ LF_ABS %7, %8 ; abs(q2 - q3)
+ pmaxub %1, %7 ; accumulate mask
+
+ paddusb %4, %4 ; 2 * abs(p0 - q0)
+ pand %3, [GLOBAL(tfe)]
+ psrlw %3, 1 ; abs(p1 - q1) / 2
+ paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+ psubusb %1, [limit]
+ psubusb %4, [blimit]
+ por %1, %4
+ pcmpeqb %1, zero ; mask
+
+ psubusb %5, [thresh]
+ pcmpeqb %5, zero ; ~hev
+%endmacro
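+
+; Per byte lane the macro computes, in C terms (a sketch; note the hev
+; result is left inverted, as the ~hev comment above says):
+;   mask = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|)
+;            <= *limit
+;          && 2 * |p0-q0| + |p1-q1| / 2 <= *blimit;
+;   hev  = |p1-p0| > *thresh || |q1-q0| > *thresh;
+; There is no bytewise shift in SSE2, so |p1-q1| / 2 clears the low bit
+; of every byte (tfe) and then uses a word shift.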
+
+%macro LF_FILTER 6
+ ; %1-%4: p1-q1
+ ; %5: mask
+ ; %6: ~hev (the hev mask is produced inverted by LF_FILTER_HEV_MASK)
+
+ movdqa scratch2, %6 ; save hev
+
+ pxor %1, [GLOBAL(t80)] ; ps1
+ pxor %4, [GLOBAL(t80)] ; qs1
+ movdqa scratch1, %1
+ psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
+ pandn scratch2, scratch1 ; vp8_filter &= hev
+
+ pxor %2, [GLOBAL(t80)] ; ps0
+ pxor %3, [GLOBAL(t80)] ; qs0
+ movdqa scratch1, %3
+ psubsb scratch1, %2 ; qs0 - ps0
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ pand %5, scratch2 ; &= mask
+
+ movdqa scratch2, %5
+ paddsb %5, [GLOBAL(t4)] ; Filter1
+ paddsb scratch2, [GLOBAL(t3)] ; Filter2
+
+ ; Filter1 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand %5, [GLOBAL(t1f)]
+ por %5, scratch1
+
+ psubsb %3, %5 ; qs0 - Filter1
+ pxor %3, [GLOBAL(t80)]
+
+ ; Filter2 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, scratch2
+ psrlw scratch2, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand scratch2, [GLOBAL(t1f)]
+ por scratch2, scratch1
+
+ paddsb %2, scratch2 ; ps0 + Filter2
+ pxor %2, [GLOBAL(t80)]
+
+ ; outer tap adjustments
+ paddsb %5, [GLOBAL(t1)]
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 1
+ pand scratch1, [GLOBAL(t80)]
+ pand %5, [GLOBAL(t7f)]
+ por %5, scratch1
+ pand %5, %6 ; vp8_filter &= ~hev
+
+ psubsb %4, %5 ; qs1 - vp8_filter
+ pxor %4, [GLOBAL(t80)]
+
+ paddsb %1, %5 ; ps1 + vp8_filter
+ pxor %1, [GLOBAL(t80)]
+%endmacro
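+
+; A minimal C sketch of the 4-tap filter this macro applies (names follow
+; the VP8 reference decoder; clamp is signed-char saturation):
+;   f = clamp(ps1 - qs1) & hev;
+;   f = clamp(f + 3 * (qs0 - ps0)) & mask;
+;   Filter1 = clamp(f + 4) >> 3;   qs0 = clamp(qs0 - Filter1);
+;   Filter2 = clamp(f + 3) >> 3;   ps0 = clamp(ps0 + Filter2);
+;   u = (clamp(Filter1 + 1) >> 1) & ~hev;
+;   qs1 = clamp(qs1 - u);   ps1 = clamp(ps1 + u);
+; The signed ">> 3" has no byte form in SSE2, so it is emulated with a
+; logical word shift plus a sign fix-up: keep the low 5 bits of each byte
+; (t1f) and OR the top bits (te0) back in where pcmpgtb saw a negative.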
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+
+%ifidn __OUTPUT_FORMAT__,x64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 11
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi ; src_ptr
+ %define stride rsi ; src_pixel_step
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+
+ %define i0 [src]
+ %define i1 [spp]
+ %define i2 [src + 2 * stride]
+ %define i3 [spp + 2 * stride]
+ %define i4 [src + 4 * stride]
+ %define i5 [spp + 4 * stride]
+ %define i6 [src + 2 * stride3]
+ %define i7 [spp + 2 * stride3]
+ %define i8 [src + 8 * stride]
+ %define i9 [spp + 8 * stride]
+ %define i10 [src + 2 * stride5]
+ %define i11 [spp + 2 * stride5]
+ %define i12 [src + 4 * stride3]
+ %define i13 [spp + 4 * stride3]
+ %define i14 [src + 2 * stride7]
+ %define i15 [spp + 2 * stride7]
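+
+ ; i0..i15 reach 16 consecutive rows from just two base pointers: even
+ ; rows offset from src, odd rows from spp = src + stride, with
+ ; stride3/5/7 precomputed so every row fits a single addressing mode,
+ ; e.g. row 6 = src + 2 * stride3 and row 7 = spp + 2 * stride3.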
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+ pxor zero, zero
+
+ ; load the first set into registers
+ movdqa xmm0, i0
+ movdqa xmm1, i1
+ movdqa xmm2, i2
+ movdqa xmm3, i3
+ movdqa xmm4, i4
+ movdqa xmm8, i5
+ movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
+ movdqa xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm3, i4
+ movdqa xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm3
+ movdqa i5, xmm8
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm4, i8
+ movdqa xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm4
+ movdqa i9, xmm8
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm3, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm3, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm3
+ movdqa i13, xmm8
+
+%ifidn __OUTPUT_FORMAT__,x64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%ifidn __OUTPUT_FORMAT__,x64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 15
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi
+ %define stride rsi
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+
+ %define s0 [src]
+ %define s1 [spp]
+ %define s2 [src + 2 * stride]
+ %define s3 [spp + 2 * stride]
+ %define s4 [src + 4 * stride]
+ %define s5 [spp + 4 * stride]
+ %define s6 [src + 2 * stride3]
+ %define s7 [spp + 2 * stride3]
+ %define s8 [src + 8 * stride]
+ %define s9 [spp + 8 * stride]
+ %define s10 [src + 2 * stride5]
+ %define s11 [spp + 2 * stride5]
+ %define s12 [src + 4 * stride3]
+ %define s13 [spp + 4 * stride3]
+ %define s14 [src + 2 * stride7]
+ %define s15 [spp + 2 * stride7]
+
+ %define i0 [rsp]
+ %define i1 [rsp + 16]
+ %define i2 [rsp + 32]
+ %define i3 [rsp + 48]
+ %define i4 [rsp + 64]
+ %define i5 [rsp + 80]
+ %define i6 [rsp + 96]
+ %define i7 [rsp + 112]
+ %define i8 [rsp + 128]
+ %define i9 [rsp + 144]
+ %define i10 [rsp + 160]
+ %define i11 [rsp + 176]
+ %define i12 [rsp + 192]
+ %define i13 [rsp + 208]
+ %define i14 [rsp + 224]
+ %define i15 [rsp + 240]
+
+ ALIGN_STACK 16, rax
+
+ ; reserve stack space
+ %define temp_storage 0 ; size is 256 (16*16)
+ %define stack_size 256
+ sub rsp, stack_size
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+
+ ; 8-f
+ movdqa xmm0, s8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s9 ; 80 90
+ punpckhbw xmm1, s9 ; 88 98
+
+ movdqa xmm2, s10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s11 ; a0 b0
+ punpckhbw xmm3, s11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s13 ; c0 d0
+ punpckhbw xmm5, s13 ; c8 d8
+
+ movdqa xmm6, s14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s15 ; e0 f0
+ punpckhbw xmm7, s15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 15 registers ...
+ movdqa i0, xmm0
+ movdqa i1, xmm7
+ movdqa i2, xmm4
+ movdqa i3, xmm3
+ movdqa i4, xmm1
+ movdqa i5, xmm8
+ movdqa i6, xmm2
+ movdqa i7, xmm5
+
+ ; 0-7
+ movdqa xmm0, s0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s1 ; 00 10
+ punpckhbw xmm1, s1 ; 08 18
+
+ movdqa xmm2, s2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s3 ; 20 30
+ punpckhbw xmm3, s3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s5 ; 40 50
+ punpckhbw xmm5, s5 ; 48 58
+
+ movdqa xmm6, s6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s7 ; 60 70
+ punpckhbw xmm7, s7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i0
+ punpckhqdq xmm6, i0
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i1
+ punpckhqdq xmm9, i1
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i2
+ punpckhqdq xmm10, i2
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i3
+ punpckhqdq xmm11, i3
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i4
+ punpckhqdq xmm12, i4
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i5
+ punpckhqdq xmm13, i5
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i6
+ punpckhqdq xmm14, i6
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i7
+ punpckhqdq xmm15, i7
+
+ movdqa i0, xmm0
+ movdqa i1, xmm6
+ movdqa i2, xmm7
+ movdqa i3, xmm9
+ movdqa i4, xmm4
+ movdqa i5, xmm10
+ movdqa i6, xmm3
+ movdqa i7, xmm11
+ movdqa i8, xmm1
+ movdqa i9, xmm12
+ movdqa i10, xmm8
+ movdqa i11, xmm13
+ movdqa i12, xmm2
+ movdqa i13, xmm14
+ movdqa i14, xmm5
+ movdqa i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+ movdqa xmm12, xmm6
+ movdqa xmm13, xmm7
+
+ pxor zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm8, i4
+ movdqa xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set
+ movdqa i4, xmm8
+ movdqa i5, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm3, i8
+ movdqa xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm3
+ movdqa i9, xmm4
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm8, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm4, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm4
+ movdqa i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+ ; 8-f
+ movdqa xmm0, i8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i9 ; 80 90
+ punpckhbw xmm1, i9 ; 88 98
+
+ movdqa xmm2, i10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i11 ; a0 b0
+ punpckhbw xmm3, i11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i13 ; c0 d0
+ punpckhbw xmm5, i13 ; c8 d8
+
+ movdqa xmm6, i14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i15 ; e0 f0
+ punpckhbw xmm7, i15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; save the calculations. we only have 15 registers ...
+ movdqa i8, xmm0
+ movdqa i9, xmm7
+ movdqa i10, xmm4
+ movdqa i11, xmm3
+ movdqa i12, xmm1
+ movdqa i13, xmm8
+ movdqa i14, xmm2
+ movdqa i15, xmm5
+
+ ; 0-7
+ movdqa xmm0, i0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i1 ; 00 10
+ punpckhbw xmm1, i1 ; 08 18
+
+ movdqa xmm2, i2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i3 ; 20 30
+ punpckhbw xmm3, i3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i5 ; 40 50
+ punpckhbw xmm5, i5 ; 48 58
+
+ movdqa xmm6, i6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i7 ; 60 70
+ punpckhbw xmm7, i7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i8
+ punpckhqdq xmm6, i8
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i9
+ punpckhqdq xmm9, i9
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i10
+ punpckhqdq xmm10, i10
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i11
+ punpckhqdq xmm11, i11
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i12
+ punpckhqdq xmm12, i12
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i13
+ punpckhqdq xmm13, i13
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i14
+ punpckhqdq xmm14, i14
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i15
+ punpckhqdq xmm15, i15
+
+ movdqa s0, xmm0
+ movdqa s1, xmm6
+ movdqa s2, xmm7
+ movdqa s3, xmm9
+ movdqa s4, xmm4
+ movdqa s5, xmm10
+ movdqa s6, xmm3
+ movdqa s7, xmm11
+ movdqa s8, xmm1
+ movdqa s9, xmm12
+ movdqa s10, xmm8
+ movdqa s11, xmm13
+ movdqa s12, xmm2
+ movdqa s13, xmm14
+ movdqa s14, xmm5
+ movdqa s15, xmm15
+
+ ; free stack space
+ add rsp, stack_size
+
+ ; un-ALIGN_STACK
+ pop rsp
+
+%ifidn __OUTPUT_FORMAT__,x64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t7f:
+ times 16 db 0x7f
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t1f:
+ times 16 db 0x1f
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
diff --git a/libvpx/vp8/common/x86/loopfilter_mmx.asm b/libvpx/vp8/common/x86/loopfilter_mmx.asm
new file mode 100644
index 0000000..f388d24
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_mmx.asm
@@ -0,0 +1,1753 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_h:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7 ;
+
+
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, [rsi] ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
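+
+ ; mm1 is now 0xFF in the lanes where every psubusb test above came back
+ ; zero, i.e. where all deltas were within limit and blimit; these are
+ ; the pixels the filter may modify.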
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor mm0, mm0 ;
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+ psraw mm5, 11
+ packsswb mm0, mm5
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn mm4, mm5 ; high edge variance additive
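+
+ ; Note on the psraw 11 shifts above: unpacking against a zeroed register
+ ; leaves each signed byte in the high half of a word (value << 8), so an
+ ; arithmetic word shift by 11 equals the byte arithmetically shifted
+ ; right by 3, e.g. (-16 << 8) >> 11 = -2.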
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ movq mm6, [rsi+2*rax] ; p1
+ pxor mm6, [GLOBAL(t80)] ; reoffset
+ paddsb mm6, mm4 ; p1+= p1 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+2*rax], mm6 ; write back
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ movq [rdi], mm7 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .next8_h
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 64 ; reserve 64 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_v:
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+
+ ;transpose
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+
+ punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
+ punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, [rsi+rax] ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+ psubusb mm5, mm7 ; q2-q3
+
+ psubusb mm7, mm6 ; q3-q2
+ por mm7, mm5; ; mm7=abs (q3-q2)
+
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+
+ psubusb mm3, mm6 ; q1-q2
+ psubusb mm6, mm5 ; q2-q1
+
+ por mm6, mm3 ; mm6=abs(q2-q1)
+ lea rdx, srct
+
+ movq [rdx+24], mm5 ; save q1
+ movq [rdx+16], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+8], mm3 ; save p0
+
+ movq [rdx], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4
+
+ psubusb mm0, mm4
+ psubusb mm1, mm4
+
+ psubusb mm6, mm4
+ por mm7, mm6
+
+ por mm0, mm1
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+16] ; mm5=q0
+ movq mm7, [rdx+24] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ movq mm2, [rdx] ; p1
+ movq mm7, [rdx+24] ; q1
+
+ movq mm6, [rdx+8] ; p0
+ movq mm0, [rdx+16] ; q0
+
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor mm0, mm0 ;
+
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ;
+
+ punpckhbw mm5, mm2 ;
+ psraw mm0, 11 ;
+
+ psraw mm5, 11
+ packsswb mm0, mm5
+
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn mm4, mm5 ; high edge variance additive
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+ ; mm6=p0 ;
+ movq mm1, [rdx] ; p1
+ pxor mm1, [GLOBAL(t80)] ; reoffset
+
+ paddsb mm1, mm4 ; p1+= p1 add
+ pxor mm1, [GLOBAL(t80)] ; unoffset
+ ; mm6 = p0 mm1 = p1
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; mm3 = q0
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ ; mm7 = q1
+
+ ; transpose and write back
+ ; mm1 = 72 62 52 42 32 22 12 02
+ ; mm6 = 73 63 53 43 33 23 13 03
+ ; mm3 = 74 64 54 44 34 24 14 04
+ ; mm7 = 75 65 55 45 35 25 15 05
+
+ movq mm2, mm1 ; 72 62 52 42 32 22 12 02
+ punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
+
+ movq mm4, mm3 ; 74 64 54 44 34 24 14 04
+ punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
+
+ punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
+ punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
+
+ movq mm6, mm2 ; 33 32 23 22 13 12 03 02
+ punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
+
+ punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
+ movq mm5, mm1 ; 73 72 63 62 53 52 43 42
+
+ punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
+ punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
+
+
+ ; mm2 = 15 14 13 12 05 04 03 02
+ ; mm6 = 35 34 33 32 25 24 23 22
+ ; mm5 = 55 54 53 52 45 44 43 42
+ ; mm1 = 75 74 73 72 65 64 63 62
+
+
+
+ movd [rsi+rax*4+2], mm2
+ psrlq mm2, 32
+
+ movd [rdi+rax*4+2], mm2
+ movd [rsi+rax*2+2], mm6
+
+ psrlq mm6, 32
+ movd [rsi+rax+2],mm6
+
+ movd [rsi+2], mm1
+ psrlq mm1, 32
+
+ movd [rdi+2], mm1
+ neg rax
+
+ movd [rdi+rax+2],mm5
+ psrlq mm5, 32
+
+ movd [rdi+rax*2+2], mm5
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+ jnz .next8_v
+
+ add rsp, 64
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbh:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx]
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7
+
+
+ ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm3=q1, mm7 = limit
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm5 = p0
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, mm0 ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5
+ pxor mm5, mm5
+ pcmpeqb mm1, mm5 ; mask mm1
+
+ ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0,
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7
+ paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb mm4, mm5
+
+ pcmpeqb mm5, mm5
+ pxor mm4, mm5
+
+
+
+ ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0, mm4=hev
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ;
+ paddsb mm5, [GLOBAL(t3)];
+
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ movq mm5, mm0 ; Filter2
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter1 = (Filter2 + 4) >> 3
+
+ ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+ psubsb mm3, mm0 ; qs0 = qs0 - Filter1
+ paddsb mm6, mm5 ; ps0 = ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; roughly 3/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
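+ ;
+ ; The 27/18/9 taps below share one pattern: after the zero-unpack the
+ ; filter value sits in the high byte of each word, pmulhw by the tap
+ ; constant (s27/s18/s9) scales it, s63 adds the +63 rounding bias, and
+ ; psraw 7 completes the >> 7 from the formulas above (assuming the
+ ; rodata constants, defined outside this hunk, encode 27/18/9 and 63
+ ; in matching fixed point).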
+ pxor mm0, mm0
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ psubsb mm3, mm1
+ paddsb mm6, mm1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+ movq [rsi+rax], mm6
+ movq [rsi], mm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm3, [rdi]
+ movq mm6, [rsi+rax*2] ; p1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdi], mm3
+ movq [rsi+rax*2], mm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+
+ movq mm6, [rdi+rax*4]
+ neg rax
+ movq mm3, [rdi+rax ]
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdi+rax ], mm3
+ neg rax
+ movq [rdi+rax*4], mm6
+
+;EARLY_BREAK_OUT:
+ neg rax
+ add rsi,8
+ dec rcx
+ jnz .next8_mbh
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (source pitch)
+
+ lea rsi, [rsi + rax*4 - 4]
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbv:
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ;transpose
+ movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+
+ movq mm7, mm6 ; 77 76 75 74 73 72 71 70
+ punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
+
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+ movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax
+
+ movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, mm7 ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
+
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ lea rdx, srct
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+
+ movq [rdx+56], mm7
+ psubusb mm5, mm7 ; q2-q3
+
+
+ movq [rdx+48], mm6
+ psubusb mm7, mm6 ; q3-q2
+
+ por mm7, mm5; ; mm7=abs (q3-q2)
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0
+
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+ psubusb mm3, mm6 ; q1-q2
+
+ psubusb mm6, mm5 ; q2-q1
+ por mm6, mm3 ; mm6=abs(q2-q1)
+
+ movq [rdx+40], mm5 ; save q1
+ movq [rdx+32], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq [rdx], mm0 ; save p3
+ movq [rdx+8], mm1 ; save p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+24], mm3 ; save p0
+
+ movq [rdx+16], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4 ; abs(q3-q2) > limit
+
+ psubusb mm0, mm4 ; abs(p3-p2) > limit
+ psubusb mm1, mm4 ; abs(p2-p1) > limit
+
+ psubusb mm6, mm4 ; abs(q2-q1) > limit
+ por mm7, mm6 ; or
+
+ por mm0, mm1 ;
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+32] ; mm5=q0
+ movq mm7, [rdx+40] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7 ; abs(q1 - q0) > thresh
+
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7 ; abs(p1 - p0)> thresh
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0
+
+ pcmpeqb mm0, mm0
+ pxor mm4, mm0
+
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ ; start work on filters
+ movq mm2, [rdx+16] ; p1
+ movq mm7, [rdx+40] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ movq mm6, [rdx+24] ; p0
+ movq mm0, [rdx+32] ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ;
+ paddsb mm5, [GLOBAL(t3)];
+
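+ ; there is no per-byte arithmetic shift, so the bytes are widened into the
+ ; high byte of each word (punpck*bw against zero); psraw by 11 (8 + 3) is
+ ; then a sign-extended >>3 of the original byte values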
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ movq mm5, mm0 ; Filter2
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter2 >>=3;
+
+ ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
+ psubsb mm3, mm0 ; qs0 = qs0 - Filter1
+ paddsb mm6, mm5 ; ps0 = ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; roughly 3/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
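+ ; punpck*bw against zero below places each byte of Filter2 in the high
+ ; byte of its word, i.e. scaled by 256, so pmulhw against s27 (27 << 8)
+ ; yields (x*256 * 27*256) >> 16 = x * 27 as a signed word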
+ pxor mm0, mm0
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s27)]
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ psubsb mm3, mm1
+ paddsb mm6, mm1
+
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+ movq [rdx+24], mm6
+ movq [rdx+32], mm3
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm3, [rdx + 40]
+ movq mm6, [rdx + 16] ; p1
+ pxor mm3, [GLOBAL(t80)]
+ pxor mm6, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+ movq [rdx + 40], mm3
+ movq [rdx + 16], mm6
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2
+
+ movq mm6, [rdx+ 8]
+ movq mm3, [rdx+48]
+
+ pxor mm6, [GLOBAL(t80)]
+ pxor mm3, [GLOBAL(t80)]
+
+ paddsb mm6, mm1
+ psubsb mm3, mm1
+
+ pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
+ pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 16 06
+
+ ; transpose and write back
+ movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
+ movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
+
+ punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
+ punpckhbw mm1, mm6 ; mm1 = 71 70 61 60 51 50 41 40
+
+ movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
+ movq mm6, mm2 ; mm6 = 72 62 52 42 32 22 12 02
+
+ punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
+ punpckhbw mm6, [rdx+24] ; mm6 = 73 72 63 62 53 52 43 42
+
+ movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
+ punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
+
+ punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
+ movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
+
+ punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
+ punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
+
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+ punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
+
+ movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 16 06
+ punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
+
+ movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
+ punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
+
+ punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
+ movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
+
+ punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
+ punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
+
+ movq [rsi+rax*4], mm0 ; write out
+ movq [rdi+rax*4], mm6 ; write out
+
+ movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
+ punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 21 20
+
+ punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
+ movq [rsi+rax*2], mm0 ; write out
+
+ movq [rdi+rax*2], mm5 ; write out
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+
+ punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 55 54 45 44
+ punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
+
+ movq mm5, mm2 ; mm5 = 75 74 65 64 55 54 45 44
+ punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
+
+ punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
+ movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
+
+ movq mm3, mm4 ; mm3 = 73 72 71 70 63 62 61 60
+ punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
+
+ punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
+ movq [rsi], mm0 ; write out
+
+ movq [rdi], mm1 ; write out
+ neg rax
+
+ punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
+ punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 70
+
+ movq [rsi+rax*2], mm3
+ movq [rdi+rax*2], mm4
+
+ lea rsi, [rsi+rax*8]
+ dec rcx
+
+ jnz .next8_mbv
+
+ add rsp, 96
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ mov rcx, 2 ; count
+.nexts8_h:
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm3, [rdx] ;
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+ neg rax
+
+ ; calculate mask
+ movq mm1, [rsi+2*rax] ; p1
+ movq mm0, [rdi] ; q1
+ movq mm2, mm1
+ movq mm7, mm0
+ movq mm4, mm0
+ psubusb mm0, mm1 ; q1-=p1
+ psubusb mm1, mm4 ; p1-=q1
+ por mm1, mm0 ; abs(p1-q1)
+ pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm1, 1 ; abs(p1-q1)/2
+
+ movq mm5, [rsi+rax] ; p0
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ movq mm6, mm5 ; p0
+ psubusb mm5, mm4 ; p0-=q0
+ psubusb mm4, mm6 ; q0-=p0
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm3, mm3
+ pcmpeqb mm5, mm3
+
+ ; start work on filters
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand mm5, mm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
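+ ; per-byte signed >>3 without a byte shift: the low bytes go through
+ ; psllw 8 / psraw 3 / psrlw 8, the high bytes through psraw 11 / psllw 8,
+ ; and por recombines the two halves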
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+ movq mm1, mm5 ; get a copy of filters
+ psraw mm1, 11 ; arithmetic shift right 11
+ psllw mm1, 8 ; shift left 8 to put it back
+
+ por mm0, mm1 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ add rsi,8
+ neg rax
+ dec rcx
+ jnz .nexts8_h
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi + rax*4- 2]; ;
+ mov rcx, 2 ; count
+.nexts8_v:
+
+ lea rdi, [rsi + rax];
+ movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
+
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+
+ movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
+ movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
+
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+ movq mm5, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
+ punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
+
+ neg rax
+
+ movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
+
+ punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
+ movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
+
+ movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
+ punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
+
+ movq mm2, mm0 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 13 03 12 02 11 01 10 00
+
+ punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
+ movq mm3, mm2 ; 33 23 13 03 32 22 12 02
+
+ punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
+ punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
+
+ punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
+
+
+ ; calculate mask
+ movq mm6, mm0 ; p1
+ movq mm7, mm3 ; q1
+ psubusb mm7, mm6 ; q1-=p1
+ psubusb mm6, mm3 ; p1-=q1
+ por mm6, mm7 ; abs(p1-q1)
+ pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm6, 1 ; abs(p1-q1)/2
+
+ movq mm5, mm1 ; p0
+ movq mm4, mm2 ; q0
+
+ psubusb mm5, mm2 ; p0-=q0
+ psubusb mm4, mm1 ; q0-=p0
+
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx]
+
+ psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm7, mm7
+ pcmpeqb mm5, mm7 ; mm5 = mask
+
+ ; start work on filters
+ movq t0, mm0
+ movq t1, mm3
+
+ pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm0, mm3 ; p1 - q1
+ movq mm6, mm1 ; p0
+
+ movq mm7, mm2 ; q0
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
+ movq mm3, mm7 ; offset q0
+
+ psubsb mm7, mm6 ; q0 - p0
+ paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand mm5, mm0 ; mask filter values we don't care about
+
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+
+ movq mm7, mm5 ; get a copy of filters
+ psraw mm7, 11 ; arithmetic shift right 11
+ psllw mm7, 8 ; shift left 8 to put it back
+
+ por mm0, mm7 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0sz add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8
+ psraw mm0, 3 ; arithmetic shift right 3
+ psrlw mm0, 8 ; move the result back to the low byte
+
+ psraw mm5, 11 ; arithmetic shift right 11
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+
+ movq mm0, t0
+ movq mm4, t1
+
+ ; mm0 = 70 60 50 40 30 20 10 00
+ ; mm6 = 71 61 51 41 31 21 11 01
+ ; mm3 = 72 62 52 42 32 22 12 02
+ ; mm4 = 73 63 53 43 33 23 13 03
+ ; transpose back to write out
+
+ movq mm1, mm0 ;
+ punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
+
+ punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
+ movq mm2, mm3 ;
+
+ punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
+ movq mm5, mm1 ; 71 70 61 60 51 50 41 40
+
+ punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
+ movq mm6, mm0 ; 31 30 21 20 11 10 01 00
+
+ punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
+ punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
+
+ movd [rsi+rax*4], mm0 ; write 03 02 01 00
+ punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
+
+ psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
+ punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
+
+ movd [rdi+rax*4], mm0 ; write 13 12 11 10
+ movd [rsi+rax*2], mm6 ; write 23 22 21 20
+
+ psrlq mm6, 32 ; 33 32 31 30
+ movd [rsi], mm1 ; write 43 42 41 40
+
+ movd [rsi + rax], mm6 ; write 33 32 31 30
+ neg rax
+
+ movd [rsi + rax*2], mm5 ; write 63 62 61 60
+ psrlq mm1, 32 ; 53 52 51 50
+
+ movd [rdi], mm1 ; write out 53 52 51 50
+ psrlq mm5, 32 ; 73 72 71 70
+
+ movd [rdi + rax*2], mm5 ; write 73 72 71 70
+
+ lea rsi, [rsi+rax*8] ; next 8
+
+ dec rcx
+ jnz .nexts8_v
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+; int y_stride,
+; loop_filter_info *lfi)
+;{
+;
+;
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+ times 8 db 0xfe ; clears each byte's lsb before the psrlw 1 halving
+align 16
+t80:
+ times 8 db 0x80 ; sign-bit flip: unsigned pixels <-> signed
+align 16
+t1s:
+ times 8 db 0x01
+align 16
+t3:
+ times 8 db 0x03
+align 16
+t4:
+ times 8 db 0x04
+align 16
+ones:
+ times 4 dw 0x0001
+align 16
+s27:
+ times 4 dw 0x1b00 ; 27 << 8, for pmulhw scaling
+align 16
+s18:
+ times 4 dw 0x1200 ; 18 << 8
+align 16
+s9:
+ times 4 dw 0x0900 ; 9 << 8
+align 16
+s63:
+ times 4 dw 0x003f ; rounding constant 63
diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 0000000..a66753b
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1640 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160
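+; ten 16-byte stack slots: two temporaries plus eight rows of saved pixels,
+; hence lf_var_size = 10 * 16 = 160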
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
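+; folding all of the absolute differences into one running maximum leaves a
+; single psubusb/pcmpeqb against the limit at the end; in rough scalar terms:
+;
+;   mask = max(abs(q3-q2), abs(q2-q1), abs(q1-q0),
+;              abs(p3-p2), abs(p2-p1), abs(p1-p0)) <= limit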
+
+%macro LFH_FILTER_AND_HEV_MASK 1
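+ ; %1 == 1: Y plane, one full 16-pixel row per load via rsi/rdi
+ ; %1 == 0: U and V planes, 8 pixels of each packed into a single xmm
+ ;          register (u rows via rsi, v rows via rdi), with p2/p1/q1/q2
+ ;          spilled to the stack for reuse by the filter macros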
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
+%else
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
+
+ movhps xmm2, [rdi + rcx*2]
+ movhps xmm1, [rdi + rcx]
+ movhps xmm4, [rdi]
+ movhps xmm5, [rdi + rax]
+
+ lea rsi, [rsi + rax*4]
+ lea rdi, [rdi + rax*4]
+
+ movdqa [rsp+_q2], xmm1 ; store q2
+ movdqa [rsp+_q1], xmm4 ; store q1
+%endif
+ movdqa xmm7, [rdx] ;limit
+
+ movdqa xmm6, xmm1 ; q2
+ movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4
+
+ psubusb xmm5, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa [rsp+_t0], xmm5 ; save to t0
+
+ pmaxub xmm1, xmm5
+
+%if %1
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax]
+ movhps xmm4, [rdi]
+ movhps xmm6, [rdi + rcx]
+
+ movdqa [rsp+_p2], xmm4 ; store p2
+ movdqa [rsp+_p1], xmm6 ; store p1
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2)
+
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2)
+
+ pmaxub xmm1, xmm5 ; abs(p2 - p1)
+ movdqa xmm2, xmm6 ; p1
+
+ pmaxub xmm1, xmm3 ; abs(p2 - p1)
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
+%else
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2]
+ movdqa xmm3, [rsp+_q1] ; q1
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm6 ; p0-=p1
+
+ psubusb xmm6, xmm5 ; p1-=p0
+
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get blimit
+
+ movdqa [rsp+_t1], xmm6 ; save to t1
+
+ movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6
+
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7
+ por xmm2, xmm3 ; abs(p1-q1)
+
+ movdqa xmm7, [rdx] ; blimit
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+
+ movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
+ movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
+
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm2, [rdx] ; hev
+
+ psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ psubusb xmm4, xmm2 ; hev
+
+ psubusb xmm3, xmm2 ; hev
+ por xmm1, xmm5
+
+ pxor xmm7, xmm7
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb xmm4, xmm5 ; hev
+ pcmpeqb xmm3, xmm3 ; hev
+
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev
+%endmacro
+
+%macro B_FILTER 1
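+ ; %1 == 0: horizontal U/V edge - p1/q1 reloaded from the stack
+ ; %1 == 1: horizontal Y edge - rows addressed directly via rsi/rdi
+ ; %1 == 2: vertical edge - p1/p0/q0/q1 come from the stack and the filtered
+ ;          values stay in registers for BV_TRANSPOSE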
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, xmm3 ; offset to convert to signed values
+
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ punpckhbw xmm5, xmm2 ; axbxcxdx
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+
+ punpcklbw xmm0, xmm1 ; exfxgxhx
+ psraw xmm5, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+
+ movdqa xmm2, [GLOBAL(ones)]
+ paddsw xmm5, xmm2
+ paddsw xmm1, xmm2
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ movdqa xmm2, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_p1] ; p1
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rsp+_p1] ; p1
+%endif
+
+ pandn xmm4, xmm5 ; high edge variance additive
+ pxor xmm6, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; reoffset
+ psubsb xmm3, xmm0 ; q0-= q0 add
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; unoffset
+ psubsb xmm7, xmm4 ; q1-= q1 add
+
+ pxor xmm7, xmm2 ; unoffset
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rax], xmm1 ; p1
+ movhps [rdi + rax], xmm1
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ movq [rsi + rcx*2], xmm7 ; q1
+ movhps [rdi + rcx*2], xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back
+ movdqa [rsi+2*rax], xmm1 ; write back
+ movdqa [rsi], xmm3 ; write back
+ movdqa [rdi], xmm7 ; write back
+%endif
+
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; int count
+;)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro MB_FILTER_AND_WRITEBACK 1
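+ ; %1 == 0: horizontal U/V edge (u rows via rsi, v rows via rdi)
+ ; %1 == 1: horizontal Y edge - rows addressed directly
+ ; %1 == 2: vertical edge - works on the stack copies; p1/p0/q0/q1 are
+ ;          written back to the stack for MBV_TRANSPOSE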
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+
+ mov rcx, rax
+ neg rcx
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+ pxor xmm6, xmm3 ; offset to convert to signed values
+ pxor xmm0, xmm3 ; offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+ pxor xmm0, xmm0
+
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+ pxor xmm1, xmm1
+
+ punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+
+ movdqa xmm5, xmm2
+
+ movdqa xmm4, [GLOBAL(s9)]
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
+
+ punpckhbw xmm7, xmm5 ; axbxcxdx
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
+
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
+
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+
+ paddsb xmm6, xmm5 ; ps0 = ps0 + Filter2
+
+ psubsb xmm3, xmm2 ; qs0 = qs0 - Filter1
+ movdqa xmm7, xmm1
+
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm5
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7
+
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
+
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
+
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
+
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
+
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
+
+%elif %1 == 1
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
+%elif %1 == 2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm1, xmm7
+ pxor xmm4, xmm7
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 + u3)
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
+
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7
+ pxor xmm5, xmm7
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 + u)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *op0 = sp^0x80
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2]
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
+%elif %1 == 1
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ; p2
+%elif %1 == 2
+ movdqa [rsp+_p1], xmm4 ; p1
+ movdqa [rsp+_p0], xmm6 ; p0
+ movdqa [rsp+_q0], xmm3 ; q0
+ movdqa [rsp+_q1], xmm1 ; q1
+%endif
+
+%endmacro
+
+
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 1
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax
+ neg rax ; negate pitch to deal with above border
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx]
+ lea rdi, [rdi + rcx]
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 0
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+%macro TRANSPOSE_16X8 2
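+ ; %1 == 1: both 8-row halves come from the same plane (advance by 8 rows)
+ ; %1 == 0: the second 8 rows come from the v plane (arg(5))
+ ; %2 == 0: additionally spill p3/p2/q2/q3 to the stack for the MB filter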
+ movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+ lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+ lea rsi, [rsi - 4]
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa [rsp+_t0], xmm2 ; save to free XMM2
+
+ movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+ movdqa xmm6, xmm1 ;
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+ movdqa [rsp+_q3], xmm7 ; save 7
+ movdqa [rsp+_q2], xmm6 ; save 6
+%endif
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa [rsp+_p1], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ movdqa [rsp+_p0], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rsp+_q0], xmm4 ; save 4
+ movdqa [rsp+_q1], xmm5 ; save 5
+ movdqa xmm1, [rsp+_t0]
+
+ movdqa xmm2, xmm1 ;
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+ movdqa [rsp+_p2], xmm1
+ movdqa [rsp+_p3], xmm2
+%endif
+
+%endmacro
+
+%macro LFV_FILTER_MASK_HEV_MASK 0
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ movdqa xmm4, xmm5 ; q1
+
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
+
+ movdqa xmm0, xmm1
+ psubusb xmm6, xmm5 ; q2-q1
+
+ por xmm6, xmm4 ; abs (q2-q1)
+ psubusb xmm0, xmm2 ; p2 - p3;
+
+ psubusb xmm2, xmm1 ; p3 - p2;
+ por xmm0, xmm2 ; abs(p2-p3)
+
+ movdqa xmm5, [rsp+_p1] ; p1
+ pmaxub xmm0, xmm7
+
+ movdqa xmm2, xmm5 ; p1
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6
+
+ pmaxub xmm0, xmm1
+ movdqa xmm1, xmm2 ; p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ pmaxub xmm0, xmm2
+
+ movdqa xmm5, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+
+ mov rdx, arg(3) ; limit
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm4, xmm7 ; q1
+
+ psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm7, xmm6 ; q1-q0
+
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ pmaxub xmm0, xmm7
+
+ psubusb xmm0, [rdx] ; limit
+
+ mov rdx, arg(2) ; blimit
+ movdqa xmm5, xmm4 ; q1
+
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+
+ por xmm5, xmm1 ; abs(p1-q1)
+ movdqa xmm1, xmm3 ; p0
+
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
+
+ movdqa xmm4, [rdx] ; blimit
+ mov rdx, arg(4) ; get thresh
+
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
+
+ por xmm1, xmm6 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ movdqa xmm3, [rdx]
+
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh
+
+ psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh
+
+ psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ por xmm1, xmm0 ; mask
+ pcmpeqb xmm2, xmm0
+
+ pxor xmm0, xmm0
+ pcmpeqb xmm4, xmm4
+
+ pcmpeqb xmm1, xmm0
+ pxor xmm4, xmm2
+%endmacro
+
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+%macro BV_WRITEBACK 2
+ movd [rsi+2], %1
+ movd [rsi+4*rax+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2], %1
+ movd [rdi+4*rax+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rsi+2*rax+2], %1
+ movd [rsi+2*rcx+2], %2
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2*rax+2], %1
+ movd [rdi+2*rcx+2], %2
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 1, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+
+ lea rdx, [rax]
+ neg rdx
+
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rdx*8]
+ lea rdi, [rdi+rdx*8]
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only works on q1, q0, p0, p1
+ BV_TRANSPOSE
+
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5
+
+ mov rsi, arg(0) ; u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ BV_WRITEBACK xmm2, xmm6
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%macro MBV_TRANSPOSE 0
+ movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+
+%macro MBV_WRITEBACK_1 0
+ movq [rsi], xmm0
+ movhps [rdi], xmm0
+
+ movq [rsi+2*rax], xmm6
+ movhps [rdi+2*rax], xmm6
+
+ movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq [rsi+4*rax], xmm0
+ movhps [rdi+4*rax], xmm0
+
+ movq [rsi+2*rcx], xmm3
+ movhps [rdi+2*rcx], xmm3
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+ punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+ movdqa xmm0, xmm7
+ punpcklwd xmm0, xmm5 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+ punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+%macro MBV_WRITEBACK_2 0
+ movq [rsi], xmm1
+ movhps [rdi], xmm1
+
+ movq [rsi+2*rax], xmm5
+ movhps [rdi+2*rax], xmm5
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+ punpckhdq xmm4, xmm7 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+ movq [rsi+4*rax], xmm1
+ movhps [rdi+4*rax], xmm1
+
+ movq [rsi+2*rcx], xmm4
+ movhps [rdi+2*rcx], xmm4
+%endmacro
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+;)
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax]
+
+ ; Transpose
+ TRANSPOSE_16X8 1, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ neg rax
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ neg rax
+
+ MBV_WRITEBACK_1
+
+
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax]
+
+ ; Transpose
+ TRANSPOSE_16X8 0, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_1
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ ; end prolog
+
+ mov rcx, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ movdqa xmm6, [GLOBAL(tfe)]
+ lea rdx, [rcx + rax]
+ neg rax
+
+ ; calculate mask
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm0
+
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm3 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, xmm6 ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, XMMWORD PTR [rdx]
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7
+
+
+ ; start work on filters
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
+
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
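+ ; psrlw can only shift words, so each byte's top 3 bits receive bits from
+ ; its neighbour; t1f masks those off and te0 re-inserts the sign bits
+ ; captured by pcmpgtb (xmm7 is still zero here from the mask computation
+ ; above, which is why the pxor can stay commented out)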
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back
+
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back
+
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi - 2 ]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ lea rsi, [rsi + rax*8]
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm7, xmm4
+ punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ mov rdx, arg(2) ;blimit
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, [rdx]
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; xmm5 = mask
+
+ ; start work on filters
+ movdqa t0, xmm0
+ movdqa t1, xmm3
+
+ pxor xmm0, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm0, xmm3 ; p1 - q1
+
+ pxor xmm1, xmm4 ; offset to convert to signed values
+ pxor xmm2, xmm4 ; offset to convert to signed values
+
+ movdqa xmm3, xmm2 ; offset q0
+ psubsb xmm2, xmm1 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movdqa xmm6, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm0, 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0-= q0sz add
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm1, xmm5 ; p0+= p0 add
+
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm1, xmm4 ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; write out order: xmm0 (p1), xmm1 (p0), xmm3 (q0), xmm4 (q1)
+ lea rdx, [rsi + rax*4]
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movd [rsi], xmm6 ; write the second 8-line result
+ movd [rdx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi], xmm6
+ movd [rcx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
+ movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8]
+ neg rax
+ lea rdi, [rsi + rax]
+ lea rdx, [rsi + rax*4]
+ lea rcx, [rdx + rax]
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ movd [rdx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi], xmm0
+ movd [rcx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
+ movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
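+; In scalar terms the filter above computes, per column of the
+; transposed block (a hedged sketch; clamp8() saturates to the
+; signed-byte range, matching paddsb/psubsb, and pixel values are
+; biased by 0x80 so signed byte math can be used):
+;
+;   mask   = (2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit) ? 0xff : 0;
+;   filter = clamp8(p1 - q1 + 3 * (q0 - p0)) & mask;
+;   q0    -= clamp8(filter + 4) >> 3;   /* Filter1 */
+;   p0    += clamp8(filter + 3) >> 3;   /* Filter2 */
+;
+; The arithmetic >> 3 on signed bytes is emulated with psrlw plus the
+; te0/t1f constants to restore the sign bits.
+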
+SECTION_RODATA
+align 16
+tfe:
+ times 16 db 0xfe
+align 16
+t80:
+ times 16 db 0x80
+align 16
+t1s:
+ times 16 db 0x01
+align 16
+t3:
+ times 16 db 0x03
+align 16
+t4:
+ times 16 db 0x04
+align 16
+ones:
+ times 8 dw 0x0001
+align 16
+s9:
+ times 8 dw 0x0900
+align 16
+s63:
+ times 8 dw 0x003f
+align 16
+te0:
+ times 16 db 0xe0
+align 16
+t1f:
+ times 16 db 0x1f
diff --git a/libvpx/vp8/common/x86/loopfilter_x86.c b/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 0000000..6586004
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_nc(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh)
+
+#define prototype_simple_loopfilter(sym) \
+ void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+
+#if HAVE_SSE2 && ARCH_X86_64
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
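+/* The _uv_ variants take the v plane as a trailing argument and filter
+ * both chroma planes in one call, so the wrappers below only test u_ptr
+ * before invoking them. */
+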
+#if HAVE_MMX
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+ if (v_ptr)
+ vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
+}
+
+
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
+}
+#endif
+
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+ vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
+
+ if (u_ptr)
+ vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+ vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+ if (u_ptr)
+ vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
+}
+
+
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
+ vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+ int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+ vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+#endif
+
+ if (u_ptr)
+ vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
+ vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
+}
+
+#endif
diff --git a/libvpx/vp8/common/x86/mfqe_sse2.asm b/libvpx/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000..c1d2174
--- /dev/null
+++ b/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,281 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_filter_by_weight16x16_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
+sym(vp8_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6
+
+.combine
+ movdqa xmm2, [rax]
+ movdqa xmm4, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
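+; A scalar sketch of the kernel above (hedged; assumes MFQE_PRECISION
+; is 4, matching tMFQE = 1 << 4 and tMFQE_round = 1 << 3):
+;
+;   void filter_by_weight16x16_c(const unsigned char *src, int src_stride,
+;                                unsigned char *dst, int dst_stride,
+;                                int src_weight) {
+;       int dst_weight = (1 << 4) - src_weight;
+;       int r, c;
+;       for (r = 0; r < 16; r++) {
+;           for (c = 0; c < 16; c++)
+;               dst[c] = (src[c] * src_weight +
+;                         dst[c] * dst_weight + 8) >> 4;
+;           src += src_stride;
+;           dst += dst_stride;
+;       }
+;   }
+;
+; The 8x8 variant below differs only in block size.
+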
+;void vp8_filter_by_weight8x8_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
+sym(vp8_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4
+
+.combine
+ movq xmm2, [rax]
+ movq xmm3, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp8_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; These loads assume the frame buffers are 16-byte aligned with
+ ; 16-byte-aligned strides (otherwise movdqa would fault).
+.accumulate
+ movdqa xmm0, [rax] ; src1
+ movdqa xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. however,
+ ; it expects one signed and one unsigned operand, so we zero-extend
+ ; to words and use pmaddwd instead.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5)
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square the sum; only the low 32-bit result is needed
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8 ; sum^2 / 256
+
+ ; phaddw could be used to sum adjacent values, but we want
+ ; all of the values summed. widen the double words to quad
+ ; words, accumulate, shift and sum.
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4)
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
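+; Scalar equivalent of the function above (hedged sketch; names are
+; illustrative): both outputs are rounded per-pixel means over the
+; 256 samples, not raw totals:
+;
+;   unsigned int sad = 0, sum = 0, sse = 0;
+;   for (i = 0; i < 256; i++) {        /* with the row strides applied */
+;       sad += abs(src1[i] - src2[i]);
+;       sum += src2[i];
+;       sse += src2[i] * src2[i];
+;   }
+;   *sad_out      = (sad + 128) >> 8;
+;   *variance_out = (sse - ((sum * sum) >> 8) + 128) >> 8;
+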
+SECTION_RODATA
+align 16
+t128:
+ ddq 128
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08
+
diff --git a/libvpx/vp8/common/x86/postproc_mmx.asm b/libvpx/vp8/common/x86/postproc_mmx.asm
new file mode 100644
index 0000000..966c586
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_mmx.asm
@@ -0,0 +1,314 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_mmx) PRIVATE
+sym(vp8_mbpost_proc_down_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 136
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp8_rv))]
+%endif
+
+ ;rows +=8;
+ add dword ptr arg(2), 8
+
+ ;for(c=0; c<cols; c+=4)
+.loop_col:
+ mov rsi, arg(0) ;s
+ pxor mm0, mm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq mm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], mm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq mm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], mm1
+
+ dec rcx
+ jne .init_border
+
+
+ lea rsi, [rsi + rax*8] ; rsi = s[-pitch*8]
+ neg rax
+
+
+ pxor mm5, mm5
+ pxor mm6, mm6 ;
+
+ pxor mm7, mm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movd mm1, DWORD PTR [rdi];
+ punpcklbw mm1, mm0 ;
+
+ paddw mm5, mm1 ;
+ pmullw mm1, mm1 ;
+
+ movq mm2, mm1 ;
+ punpcklwd mm1, mm0 ;
+
+ punpckhwd mm2, mm0 ;
+ paddd mm6, mm1 ;
+
+ paddd mm7, mm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
+ movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+
+ paddw mm5, mm2
+ psubw mm5, mm1
+
+ pmullw mm2, mm2
+ movq mm4, mm2
+
+ punpcklwd mm2, mm0
+ punpckhwd mm4, mm0
+
+ paddd mm6, mm2
+ paddd mm7, mm4
+
+ pmullw mm1, mm1
+ movq mm2, mm1
+
+ punpcklwd mm1, mm0
+ psubd mm6, mm1
+
+ punpckhwd mm2, mm0
+ psubd mm7, mm2
+
+
+ movq mm3, mm6
+ pslld mm3, 4
+
+ psubd mm3, mm6
+ movq mm1, mm5
+
+ movq mm4, mm5
+ pmullw mm1, mm1
+
+ pmulhw mm4, mm4
+ movq mm2, mm1
+
+ punpcklwd mm1, mm4
+ punpckhwd mm2, mm4
+
+ movq mm4, mm7
+ pslld mm4, 4
+
+ psubd mm4, mm7
+
+ psubd mm3, mm1
+ psubd mm4, mm2
+
+ psubd mm3, flimit2
+ psubd mm4, flimit2
+
+ psrad mm3, 31
+ psrad mm4, 31
+
+ packssdw mm3, mm4
+ packsswb mm3, mm0
+
+ movd mm1, DWORD PTR [rsi+rax*8]
+
+ movq mm2, mm1
+ punpcklbw mm1, mm0
+
+ paddw mm1, mm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp8_rv))]
+ movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movq mm4, [sym(vp8_rv) + rcx*2]
+%endif
+ paddw mm1, mm4
+ ;paddw xmm1, eight8s
+ psraw mm1, 4
+
+ packuswb mm1, mm0
+ pand mm1, mm3
+
+ pandn mm3, mm2
+ por mm1, mm3
+
+ and rcx, 15
+ movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+ movd [rsi], mm1
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+
+ add dword arg(0), 4 ; s += 4
+ sub dword arg(3), 4 ; cols -= 4
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 136
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit2
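+
+; Hedged scalar sketch of the column filter above (it mirrors the C
+; reference vp8_mbpost_proc_down_c): a 15-tap running window of sum and
+; sum-of-squares gates a dithered average:
+;
+;   for each column:
+;       sum   = s[r-8] + ... + s[r+6];     /* maintained incrementally */
+;       sumsq = s[r-8]^2 + ... + s[r+6]^2;
+;       if (sumsq * 15 - sum * sum < flimit)
+;           out = (sum + s[r] + vp8_rv[r & 127]) >> 4;
+;       else
+;           out = s[r];
+;
+; Results are staged in a small ring buffer on the stack and written
+; back eight rows behind the read position so the window always sees
+; unfiltered input.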
+
+
+;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_mmx) PRIVATE
+sym(vp8_plane_add_noise_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movq mm1,[rsi+rax] ; get the source
+
+ psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb mm1, [rdx+32] ;bothclamp
+ psubusb mm1, [rdx+16] ;whiteclamp
+
+ movq mm2,[rdi+rax] ; get the noise for this line
+ paddb mm1,mm2 ; add it in
+ movq [rsi+rax],mm1 ; store the result
+
+ add rax,8 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
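+; Per row, the loop above is equivalent to (hedged sketch):
+;
+;   ref = noise + (rand() & 0xff);
+;   for (c = 0; c < Width; c++) {
+;       int p = clamp(Start[c], blackclamp[0], 255 - whiteclamp[0]);
+;       Start[c] = p + ref[c];   /* cannot wrap: headroom was reserved */
+;   }
+;   Start += Pitch;
+;
+; The psubusb/paddusb/psubusb triple performs the clamp because
+; bothclamp holds blackclamp + whiteclamp.
+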
+
+SECTION_RODATA
+align 16
+Blur:
+ times 16 dw 16
+ times 8 dw 64
+ times 16 dw 16
+ times 8 dw 0
+
+rd:
+ times 4 dw 0x40
diff --git a/libvpx/vp8/common/x86/postproc_sse2.asm b/libvpx/vp8/common/x86/postproc_sse2.asm
new file mode 100644
index 0000000..00f84a3
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_sse2.asm
@@ -0,0 +1,721 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;macro in deblock functions
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1
+ paddusb xmm6, xmm3
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2
+ paddusb xmm4, xmm3
+
+ pavgb xmm5, xmm1
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0
+
+ ;decide if or not to use filtered value
+ pand xmm0, xmm7
+ pandn xmm7, xmm5
+ paddusb xmm0, xmm7
+%endmacro
+
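+; Per byte, the two macros above implement (hedged sketch; v[0] is the
+; center tap and v[-2..2] its neighbors along the filter direction):
+;
+;   if (abs(v[0] - v[-1]) >= flimit || abs(v[0] - v[-2]) >= flimit ||
+;       abs(v[0] - v[ 1]) >= flimit || abs(v[0] - v[ 2]) >= flimit)
+;       out = v[0];                     /* mask keeps the source pixel */
+;   else
+;       out = avg(avg(avg(v[1], v[2]), avg(v[-2], v[-1])), v[0]);
+;
+; where avg(a, b) = (a + b + 1) >> 1, the pavgb rounding average.
+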
+%macro UPDATE_FLIMIT 0
+ movdqa xmm2, XMMWORD PTR [rbx]
+ movdqa [rsp], xmm2
+ add rbx, 16
+%endmacro
+
+;void vp8_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int cols,
+; int *flimits,
+; int size
+;)
+global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vp8_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+ ; done with all the cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16];
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+ ; done with this row
+ add rsi,rax ;next src line
+ mov eax, dword arg(3) ;dst_pixels_per_line
+ add rdi,rax ;next destination
+ mov eax, dword arg(2) ;src_pixels_per_line
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit
+
+;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_xmm) PRIVATE
+sym(vp8_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp8_rv))]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8
+
+ ;for(c=0; c<cols; c+=8)
+.loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq xmm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq xmm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_border
+
+
+
+ lea rsi, [rsi + rax*8] ; rsi = s[-pitch*8]
+ neg rax
+
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6 ;
+
+ pxor xmm7, xmm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2
+ psubw xmm5, xmm1
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1
+
+ pmulhw xmm4, xmm4
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4
+ punpckhwd xmm2, xmm4
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0
+
+ movq xmm1, QWORD PTR [rsi+rax*8]
+
+ movq xmm2, xmm1
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp8_rv))]
+ movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movdqu xmm4, [sym(vp8_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3
+
+ pandn xmm3, xmm2
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8]
+
+ movq [rsi], mm0
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+ add dword arg(0), 8 ; s += 8
+ sub dword arg(3), 8 ; cols -= 8
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 128+16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp8_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(3)
+ movq mm1, [rsi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rdx], mm1
+
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_wmt) PRIVATE
+sym(vp8_plane_add_noise_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop:
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb xmm1, [rdx+32] ;bothclamp
+ psubusb xmm1, [rdx+16] ;whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+four8s:
+ times 4 dd 8
diff --git a/libvpx/vp8/common/x86/postproc_x86.c b/libvpx/vp8/common/x86/postproc_x86.c
new file mode 100644
index 0000000..3ec0106
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_x86.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* On the Android NDK, rand is an inline function, but postproc needs a real rand symbol */
+#if defined(__ANDROID__)
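+/* While stdlib.h is included, the define below renames the NDK's inline
+ * rand to __rand; the wrapper that follows then exports a real rand
+ * symbol which forwards to that inline. */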
+#define rand __rand
+#include <stdlib.h>
+#undef rand
+
+extern int rand(void)
+{
+ return __rand();
+}
+#else
+/* ISO C forbids an empty translation unit. */
+int vp8_unused;
+#endif
diff --git a/libvpx/vp8/common/x86/recon_mmx.asm b/libvpx/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 0000000..15e9871
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,274 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void copy_mem8x8_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
+sym(vp8_copy_mem8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ add rsi, rax
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx*2], mm2
+
+
+ lea rdi, [rdi+rcx*2]
+ movq mm3, [rsi]
+
+ add rdi, rcx
+ movq mm4, [rsi+rax]
+
+ movq mm5, [rsi+rax*2]
+ movq [rdi], mm3
+
+ lea rsi, [rsi+rax*2]
+ movq [rdi+rcx], mm4
+
+ movq [rdi+rcx*2], mm5
+ lea rdi, [rdi+rcx*2]
+
+ movq mm0, [rsi+rax]
+ movq mm1, [rsi+rax*2]
+
+ movq [rdi+rcx], mm0
+ movq [rdi+rcx*2],mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void copy_mem8x4_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
+sym(vp8_copy_mem8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax]
+ movq mm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ movq [rdi+rcx], mm1
+
+ movq [rdi+rcx*2], mm2
+ lea rdi, [rdi+rcx*2]
+
+ movq mm3, [rsi+rax]
+ movq [rdi+rcx], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void copy_mem16x16_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
+sym(vp8_copy_mem16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movsxd rax, dword ptr arg(1) ;src_stride;
+
+ mov rdi, arg(2) ;dst;
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi]
+ movq mm3, [rsi+8];
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 0000000..fe77450
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,1080 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void copy_mem16x16_sse2(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
+sym(vp8_copy_mem16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movdqu xmm0, [rsi]
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movdqu xmm1, [rsi+rax]
+ movdqu xmm2, [rsi+rax*2]
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm3, [rsi]
+
+ add rdi, rcx
+ movdqu xmm4, [rsi+rax]
+
+ movdqu xmm5, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm3
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm4
+ movdqa [rdi+rcx*2],xmm5
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm0, [rsi]
+
+ add rdi, rcx
+ movdqu xmm1, [rsi+rax]
+
+ movdqu xmm2, [rsi+rax*2]
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax
+
+ movdqa [rdi+rcx], xmm1
+
+ movdqa [rdi+rcx*2], xmm2
+ movdqu xmm3, [rsi]
+
+ movdqu xmm4, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ add rdi, rcx
+ movdqu xmm5, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm3
+
+ add rsi, rax
+ movdqa [rdi+rcx], xmm4
+
+ movdqa [rdi+rcx*2],xmm5
+ movdqu xmm0, [rsi]
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm1, [rsi+rax]
+
+ add rdi, rcx
+ movdqu xmm2, [rsi+rax*2]
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm0
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ movdqu xmm3, [rsi+rax]
+ lea rdi, [rdi+rcx*2]
+
+ movdqa [rdi+rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dc_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rdi, arg(2) ;above;
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor mm0, mm0
+ movq mm1, [rdi]
+ lea rdi, [rax*3]
+ psadbw mm1, mm0
+ ; from left
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax*1]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+
+ movzx edx, byte [rsi+rdi]
+ lea rsi, [rsi+rax*4]
+ add ecx, edx
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, mm1, 0x0
+ lea edx, [edx+ecx+8]
+ sar edx, 4
+ movd mm1, edx
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ pshufw mm1, mm1, 0x0
+ mov rdi, arg(0) ;dst;
+ packuswb mm1, mm1
+
+ ; write out
+ lea rax, [rcx*3]
+ lea rdx, [rdi+rcx*4]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ movq [rdx ], mm1
+ movq [rdx+rcx ], mm1
+ movq [rdx+rcx*2], mm1
+ movq [rdx+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
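+; The DC predictor above reduces to (hedged sketch):
+;
+;   int dc = 8;                                   /* rounding term */
+;   for (i = 0; i < 8; i++)
+;       dc += above[i] + left[i * left_stride];
+;   dc >>= 4;
+;   /* fill the 8x8 destination with dc */
+;
+; The dctop/dcleft/dc128 variants below use only the top row, only the
+; left column, or the constant 128, with the rounding and shift scaled
+; to match.
+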
+;void vp8_intra_pred_uv_dctop_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dctop_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ ; from top
+ mov rsi, arg(2) ;above;
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0
+
+ ; add up
+ paddw mm1, [GLOBAL(dc_4)]
+ psraw mm1, 3
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; from left
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+4]
+
+ ; add up
+ shr edx, 3
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0
+ packuswb mm1, mm1
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4]
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
+sym(vp8_intra_pred_uv_dc128_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(2), arg(3), arg(4) not used
+
+ ; write out
+ movq mm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
+sym(vp8_intra_pred_uv_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read top row
+ mov edx, 4
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm2, [GLOBAL(dc_1024)]
+%endif
+ movq xmm1, [rsi]
+ punpcklbw xmm1, xmm0
+
+ ; set up left ptrs and subtract topleft
+ movd xmm3, [rsi-1]
+ mov rsi, arg(3) ;left;
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ punpcklqdq xmm3, xmm3
+%else
+ pshufb xmm3, xmm2
+%endif
+ psubw xmm1, xmm3
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+.vp8_intra_pred_uv_tm_%1_loop:
+ movd xmm3, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm3, xmm3
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm3, xmm2
+ pshufb xmm5, xmm2
+%endif
+ paddw xmm3, xmm1
+ paddw xmm5, xmm1
+ packuswb xmm3, xmm5
+ movq [rdi ], xmm3
+ movhps[rdi+rcx], xmm3
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_uv_tm_%1_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
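+
+; TrueMotion (TM) prediction in scalar form (hedged sketch):
+;
+;   for (r = 0; r < 8; r++)
+;       for (c = 0; c < 8; c++)
+;           dst[r * dst_stride + c] =
+;               clamp255(left[r * left_stride] + above[c] - above[-1]);
+;
+; The top-left pixel (above[-1]) is subtracted once outside the loop,
+; and packuswb supplies the clamp to [0, 255].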
+
+;void vp8_intra_pred_uv_ve_mmx(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
+sym(vp8_intra_pred_uv_ve_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ ; end prolog
+
+ ; arg(3), arg(4) not used
+
+ ; read from top
+ mov rax, arg(2) ;above;
+
+ movq mm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4]
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+%macro vp8_intra_pred_uv_ho 1
+global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
+sym(vp8_intra_pred_uv_ho_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+%ifidn %1, ssse3
+%ifndef GET_GOT_SAVE_ARG
+ push rbx
+%endif
+ GET_GOT rbx
+%endif
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; read from left and write out
+%ifidn %1, mmx2
+ mov edx, 4
+%endif
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+ lea rdx, [rcx*3]
+ movdqa xmm2, [GLOBAL(dc_00001111)]
+ lea rbx, [rax*3]
+%endif
+
+%ifidn %1, mmx2
+.vp8_intra_pred_uv_ho_%1_loop:
+ movd mm0, [rsi]
+ movd mm1, [rsi+rax]
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm0, mm0, 0x0
+ pshufw mm1, mm1, 0x0
+ movq [rdi ], mm0
+ movq [rdi+rcx], mm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_uv_ho_%1_loop
+%else
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+ lea rsi, [rsi+rax*4]
+ lea rdi, [rdi+rcx*4]
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+%endif
+
+ ; begin epilog
+%ifidn %1, ssse3
+ RESTORE_GOT
+%ifndef GET_GOT_SAVE_ARG
+ pop rbx
+%endif
+%endif
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_ho mmx2
+vp8_intra_pred_uv_ho ssse3
+
+;void vp8_intra_pred_y_dc_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
+sym(vp8_intra_pred_y_dc_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rdi, arg(2) ;above
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
+ pxor xmm0, xmm0
+ movdqa xmm1, [rdi]
+ psadbw xmm1, xmm0
+ movq xmm2, xmm1
+ punpckhqdq xmm1, xmm1
+ paddw xmm1, xmm2
+
+ ; from left
+ lea rdi, [rax*3]
+
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+
+ ; add up
+ pextrw edx, xmm1, 0x0
+ lea edx, [edx+ecx+16]
+ sar edx, 5
+ movd xmm1, edx
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1
+ packuswb xmm1, xmm1
+
+ ; write out
+ mov rsi, 2
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+.label
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dctop_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
+sym(vp8_intra_pred_y_dctop_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ ; from top
+ mov rcx, arg(2) ;above;
+ pxor xmm0, xmm0
+ movdqa xmm1, [rcx]
+ psadbw xmm1, xmm0
+ movdqa xmm2, xmm1
+ punpckhqdq xmm1, xmm1
+ paddw xmm1, xmm2
+
+ ; add up
+ paddw xmm1, [GLOBAL(dc_8)]
+ psraw xmm1, 4
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1
+ packuswb xmm1, xmm1
+
+ ; write out
+ mov rsi, 2
+ mov rdx, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+.label
+ movdqa [rdx ], xmm1
+ movdqa [rdx+rcx ], xmm1
+ movdqa [rdx+rcx*2], xmm1
+ movdqa [rdx+rax ], xmm1
+ lea rdx, [rdx+rcx*4]
+ movdqa [rdx ], xmm1
+ movdqa [rdx+rcx ], xmm1
+ movdqa [rdx+rcx*2], xmm1
+ movdqa [rdx+rax ], xmm1
+ lea rdx, [rdx+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dcleft_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
+sym(vp8_intra_pred_y_dcleft_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; from left
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
+ lea rdi, [rax*3]
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4]
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+8]
+
+ ; add up
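+    ; the lea above folds the last pixel and the +8 rounding bias into one
+    ; instruction, so (left_sum + 8) >> 4 is the rounded 16-pixel average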
+ shr edx, 4
+ movd xmm1, edx
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1
+ packuswb xmm1, xmm1
+
+ ; write out
+ mov rsi, 2
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3]
+
+.label
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dc128_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
+sym(vp8_intra_pred_y_dc128_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(2), arg(3), arg(4) not used
+
+ ; write out
+ mov rsi, 2
+ movdqa xmm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3]
+
+.label
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_tm_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+%macro vp8_intra_pred_y_tm 1
+global sym(vp8_intra_pred_y_tm_%1) PRIVATE
+sym(vp8_intra_pred_y_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ GET_GOT rbx
+ ; end prolog
+
+ ; read top row
+ mov edx, 8
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm3, [GLOBAL(dc_1024)]
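+    ; dc_1024 (words of 0x0400) doubles as a pshufb control: byte 0 goes
+    ; to the even lanes and byte 4 (zero after movd) to the odd lanes,
+    ; broadcasting a pixel as eight words in a single instruction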
+%endif
+ movdqa xmm1, [rsi]
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+
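+    ; TrueMotion: pred[r][c] = left[r] + above[c] - topleft. The two
+    ; (above - topleft) word vectors are formed once below; the row loop
+    ; then only adds each left pixel and saturates back to bytes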
+    ; set up left ptrs and subtract topleft
+ movd xmm4, [rsi-1]
+ mov rsi, arg(3) ;left
+%ifidn %1, sse2
+ punpcklbw xmm4, xmm0
+ pshuflw xmm4, xmm4, 0x0
+ punpcklqdq xmm4, xmm4
+%else
+ pshufb xmm4, xmm3
+%endif
+ psubw xmm1, xmm4
+ psubw xmm2, xmm4
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+vp8_intra_pred_y_tm_%1_loop:
+ movd xmm4, [rsi]
+ movd xmm5, [rsi+rax]
+%ifidn %1, sse2
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm4, xmm4, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm4, xmm3
+ pshufb xmm5, xmm3
+%endif
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ paddw xmm4, xmm1
+ paddw xmm6, xmm2
+ paddw xmm5, xmm1
+ paddw xmm7, xmm2
+ packuswb xmm4, xmm6
+ packuswb xmm5, xmm7
+ movdqa [rdi ], xmm4
+ movdqa [rdi+rcx], xmm5
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_y_tm_%1_loop
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_y_tm sse2
+vp8_intra_pred_y_tm ssse3
+
+;void vp8_intra_pred_y_ve_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
+sym(vp8_intra_pred_y_ve_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ mov rax, arg(2) ;above;
+ mov rsi, 2
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+
+ ; read from top
+ movdqa xmm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ lea rcx, [rdx*3]
+
+.label
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_ho_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
+sym(vp8_intra_pred_y_ho_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; read from left and write out
+ mov edx, 8
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+vp8_intra_pred_y_ho_sse2_loop:
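+    ; broadcast each left pixel across a 16-byte row: duplicate the byte,
+    ; splat the low word, then copy the low qword into the high qword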
+ movd xmm0, [rsi]
+ movd xmm1, [rsi+rax]
+ ; FIXME use pshufb for ssse3 version
+ punpcklbw xmm0, xmm0
+ punpcklbw xmm1, xmm1
+ pshuflw xmm0, xmm0, 0x0
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ movdqa [rdi ], xmm0
+ movdqa [rdi+rcx], xmm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz vp8_intra_pred_y_ho_sse2_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+dc_128:
+ times 16 db 128
+dc_4:
+ times 4 dw 4
+align 16
+dc_8:
+ times 8 dw 8
+align 16
+dc_1024:
+ times 8 dw 0x400
+align 16
+dc_00001111:
+ times 8 db 0
+ times 8 db 1
diff --git a/libvpx/vp8/common/x86/recon_wrapper_sse2.c b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
new file mode 100644
index 0000000..b482faa
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/blockd.h"
+
+#define build_intra_predictors_mbuv_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
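+/* Every assembly predictor declared below takes (dst, dst_stride, above,
+ * left, left_stride); sharing one prototype lets the mode switch dispatch
+ * through a single function pointer. */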
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_stride,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ build_intra_predictors_mbuv_fn_t tm_func,
+ build_intra_predictors_mbuv_fn_t ho_func)
+{
+ int mode = x->mode_info_context->mbmi.uv_mode;
+ build_intra_predictors_mbuv_fn_t fn;
+
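+    /* For DC prediction, pick the variant matching the available
+       neighbors: both edges, top row only, left column only, or the
+       constant-128 predictor when neither edge exists. */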
+ switch (mode) {
+ case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
+ case H_PRED: fn = ho_func; break;
+ case TM_PRED: fn = tm_func; break;
+ case DC_PRED:
+ if (x->up_available) {
+ if (x->left_available) {
+ fn = vp8_intra_pred_uv_dc_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dctop_mmx2; break;
+ }
+ } else if (x->left_available) {
+ fn = vp8_intra_pred_uv_dcleft_mmx2; break;
+ } else {
+ fn = vp8_intra_pred_uv_dc128_mmx; break;
+ }
+ break;
+ default: return;
+ }
+
+ fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
+ fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
+}
+
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ vp8_build_intra_predictors_mbuv_x86(x,
+ uabove_row, vabove_row,
+ upred_ptr,
+ vpred_ptr, pred_stride,
+ uleft,
+ vleft,
+ left_stride,
+ vp8_intra_pred_uv_tm_sse2,
+ vp8_intra_pred_uv_ho_mmx2);
+}
+
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
+ unsigned char * uabove_row,
+ unsigned char * vabove_row,
+ unsigned char * uleft,
+ unsigned char * vleft,
+ int left_stride,
+ unsigned char * upred_ptr,
+ unsigned char * vpred_ptr,
+ int pred_stride)
+{
+ vp8_build_intra_predictors_mbuv_x86(x,
+ uabove_row, vabove_row,
+ upred_ptr,
+ vpred_ptr, pred_stride,
+ uleft,
+ vleft,
+ left_stride,
+ vp8_intra_pred_uv_tm_ssse3,
+ vp8_intra_pred_uv_ho_ssse3);
+}
+
+#define build_intra_predictors_mby_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
+typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t));
+
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3);
+
+static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char *dst_y,
+ int dst_stride,
+ unsigned char * yleft,
+ int left_stride,
+ build_intra_predictors_mby_fn_t tm_func)
+{
+ int mode = x->mode_info_context->mbmi.mode;
+    build_intra_predictors_mby_fn_t fn;
+
+ switch (mode) {
+ case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
+ case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
+ case TM_PRED: fn = tm_func; break;
+ case DC_PRED:
+ if (x->up_available) {
+ if (x->left_available) {
+ fn = vp8_intra_pred_y_dc_sse2; break;
+ } else {
+ fn = vp8_intra_pred_y_dctop_sse2; break;
+ }
+ } else if (x->left_available) {
+ fn = vp8_intra_pred_y_dcleft_sse2; break;
+ } else {
+ fn = vp8_intra_pred_y_dc128_sse2; break;
+ }
+ break;
+ default: return;
+ }
+
+ fn(dst_y, dst_stride, yabove_row, yleft, left_stride);
+}
+
+void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
+ y_stride, yleft, left_stride,
+ vp8_intra_pred_y_tm_sse2);
+}
+
+void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x,
+ unsigned char * yabove_row,
+ unsigned char * yleft,
+ int left_stride,
+ unsigned char * ypred_ptr,
+ int y_stride)
+{
+ vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
+ y_stride, yleft, left_stride,
+ vp8_intra_pred_y_tm_ssse3);
+}
diff --git a/libvpx/vp8/common/x86/sad_mmx.asm b/libvpx/vp8/common/x86/sad_mmx.asm
new file mode 100644
index 0000000..592112f
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_mmx.asm
@@ -0,0 +1,427 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx) PRIVATE
+global sym(vp8_sad8x16_mmx) PRIVATE
+global sym(vp8_sad8x8_mmx) PRIVATE
+global sym(vp8_sad4x4_mmx) PRIVATE
+global sym(vp8_sad16x8_mmx) PRIVATE
+
+;unsigned int vp8_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x16x16sad_mmx_loop:
+
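+    ; MMX lacks psadbw: psubusb in both directions followed by por yields
+    ; |src - ref| per byte, which is then widened against zero (mm6) and
+    ; accumulated as four word sums in mm7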
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne .x16x16sad_mmx_loop
+
+
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0
+
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne .x8x16sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax]
+ add rdi, rdx
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne .x8x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movq mm2, mm0
+ psubusb mm0, mm1
+
+ psubusb mm1, mm2
+ por mm0, mm1
+
+ movq mm2, mm0
+ pxor mm3, mm3
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movd mm4, DWORD PTR [rsi]
+ movd mm5, DWORD PTR [rdi]
+
+ movd mm6, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5
+
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+sym(vp8_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+ pxor mm7, mm7
+
+ pxor mm6, mm6
+
+.x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1
+ psubusb mm1, mm4
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1
+ por mm2, mm3
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x16x8sad_mmx_loop
+
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_sse2.asm b/libvpx/vp8/common/x86/sad_sse2.asm
new file mode 100644
index 0000000..8d86abc
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse2.asm
@@ -0,0 +1,410 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad16x16_wmt) PRIVATE
+sym(vp8_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8]
+ pxor xmm6, xmm6
+
+.x16x16sad_wmt_loop:
+
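+    ; the two 8-byte halves of each row are interleaved with punpcklbw;
+    ; src and ref are interleaved identically, so a single psadbw still
+    ; produces the correct 16-byte row SAD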
+ movq xmm0, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax]
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2
+ punpcklbw xmm1, xmm3
+
+ psadbw xmm0, xmm1
+ movq xmm2, QWORD PTR [rsi+rax+8]
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm2
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
+
+ cmp rsi, rcx
+ jne .x16x16sad_wmt_loop
+
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movq rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+global sym(vp8_sad8x16_wmt) PRIVATE
+sym(vp8_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8]
+ pxor mm7, mm7
+
+.x8x16sad_wmt_loop:
+
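+    ; early out: the running SAD in mm7 is compared against max_sad
+    ; (arg 4) every two rows so hopeless candidates are abandoned early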
+ movq rax, mm7
+ cmp eax, arg(4)
+ ja .x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne .x8x16sad_wmt_loop
+
+ movq rax, mm7
+
+.x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+global sym(vp8_sad8x8_wmt) PRIVATE
+sym(vp8_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+.x8x8sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ ja .x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx]
+
+ add rdi, rdx
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x8x8sad_wmt_loop
+
+ movq rax, mm7
+.x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+global sym(vp8_sad4x4_wmt) PRIVATE
+sym(vp8_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, DWORD PTR [rsi]
+
+ movd mm5, DWORD PTR [rdi]
+ movd mm6, DWORD PTR [rsi+rax]
+
+ movd mm7, DWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5
+
+ paddw mm0, mm4
+ movq rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+global sym(vp8_sad16x8_wmt) PRIVATE
+sym(vp8_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+ pxor mm7, mm7
+
+.x16x8sad_wmt_loop:
+
+ movq rax, mm7
+ cmp eax, arg(4)
+ ja .x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx]
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8]
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2
+ paddw mm4, mm1
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne .x16x8sad_wmt_loop
+
+ movq rax, mm7
+
+.x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
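+    ; src may be unaligned (movdqu loads) but dst must be 16-byte aligned
+    ; (movdqa stores); four 32-byte rows are copied per iteration, with a
+    ; one-row loop below handling the remainder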
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je .copy_is_done
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_sse3.asm b/libvpx/vp8/common/x86/sad_sse3.asm
new file mode 100644
index 0000000..f90a589
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse3.asm
@@ -0,0 +1,960 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
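+; STACK_FRAME_CREATE_X3/STACK_FRAME_DESTROY_X3 map the arguments onto
+; symbolic register names for each ABI (32-bit stack args, 64-bit Windows,
+; 64-bit SysV) so the function bodies below can be written once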
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
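+; STACK_FRAME_CREATE_X4 additionally dereferences ref_ptr_base into the
+; four candidate pointers r0_ptr..r3_ptr (via LOAD_X4_ADDRESSES, defined
+; below) for the x4d multi-reference SAD functions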
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define r0_ptr rcx
+ %define r1_ptr rdx
+ %define r2_ptr rbx
+ %define r3_ptr rdi
+ %define ref_stride rbp
+ %define result_ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ; src_ptr
+
+ movsxd rbx, dword ptr arg(1) ; src_stride
+ movsxd rbp, dword ptr arg(3) ; ref_stride
+
+ xchg rbx, rax
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define r0_ptr rsi
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr r8
+ %define ref_stride r9
+ %define result_ptr [rsp+xmm_stack_space+16+4*8]
+ push rsi
+
+ LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define r0_ptr r9
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr rdx
+ %define ref_stride rcx
+ %define result_ptr r8
+
+ LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X4 0
+ %define src_ptr
+ %define src_stride
+ %define r0_ptr
+ %define r1_ptr
+ %define r2_ptr
+ %define r3_ptr
+ %define ref_stride
+ %define result_ptr
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
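+; PROCESS_16X2X3 accumulates the SADs of two source rows against the
+; candidates at ref, ref+1 and ref+2 into xmm5/xmm6/xmm7. The first
+; parameter selects the phase: 0 initializes the accumulators (and
+; advances the pointers), 1 accumulates and advances, 2 handles the final
+; row pair without advancing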
+%macro PROCESS_16X2X3 5
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm5, XMMWORD PTR [%3]
+ lddqu xmm6, XMMWORD PTR [%3+1]
+ lddqu xmm7, XMMWORD PTR [%3+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%3+1]
+ lddqu xmm3, XMMWORD PTR [%3+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%4]
+ lddqu xmm1, XMMWORD PTR [%3+%5]
+ lddqu xmm2, XMMWORD PTR [%3+%5+1]
+ lddqu xmm3, XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 5
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm5, QWORD PTR [%3]
+ movq mm6, QWORD PTR [%3+1]
+ movq mm7, QWORD PTR [%3+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%3+1]
+ movq mm3, QWORD PTR [%3+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ movq mm0, QWORD PTR [%2+%4]
+ movq mm1, QWORD PTR [%3+%5]
+ movq mm2, QWORD PTR [%3+%5+1]
+ movq mm3, QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+%macro PROCESS_16X2X4 8
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm4, XMMWORD PTR [%3]
+ lddqu xmm5, XMMWORD PTR [%4]
+ lddqu xmm6, XMMWORD PTR [%5]
+ lddqu xmm7, XMMWORD PTR [%6]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%4]
+ lddqu xmm3, XMMWORD PTR [%5]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ movdqa xmm0, XMMWORD PTR [%2+%7]
+ lddqu xmm1, XMMWORD PTR [%3+%8]
+ lddqu xmm2, XMMWORD PTR [%4+%8]
+ lddqu xmm3, XMMWORD PTR [%5+%8]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6+%8]
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 8
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm4, QWORD PTR [%3]
+ movq mm5, QWORD PTR [%4]
+ movq mm6, QWORD PTR [%5]
+ movq mm7, QWORD PTR [%6]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%4]
+ movq mm3, QWORD PTR [%5]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ movq mm0, QWORD PTR [%2+%7]
+ movq mm1, QWORD PTR [%3+%8]
+ movq mm2, QWORD PTR [%4+%8]
+ movq mm3, QWORD PTR [%5+%8]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6+%8]
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
+;void vp8_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_sse3) PRIVATE
+sym(vp8_sad16x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_sse3) PRIVATE
+sym(vp8_sad16x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x3_sse3) PRIVATE
+sym(vp8_sad8x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x3_sse3) PRIVATE
+sym(vp8_sad8x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6
+
+ movq [rcx], mm5
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x3_sse3) PRIVATE
+sym(vp8_sad4x4x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [ref_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [ref_ptr+1]
+ movd mm5, DWORD PTR [ref_ptr+2]
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [ref_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm6
+
+ movd mm3, DWORD PTR [ref_ptr+1]
+ movd mm7, DWORD PTR [ref_ptr+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ mov rcx, result_ptr
+
+ punpckldq mm1, mm3
+
+ movq [rcx], mm1
+ movd [rcx+8], mm7
+
+ STACK_FRAME_DESTROY_X3
+
+;unsigned int vp8_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3) PRIVATE
+sym(vp8_sad16x16_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ mov end_ptr, 4
+ pxor xmm7, xmm7
+
+.vp8_sad16x16_sse3_loop:
+ movdqa xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [ref_ptr]
+ movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
+ movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa xmm4, XMMWORD PTR [src_ptr]
+ movdqu xmm5, XMMWORD PTR [ref_ptr]
+ movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
+
+ psadbw xmm0, xmm1
+
+ movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
+
+ psadbw xmm2, xmm3
+ psadbw xmm4, xmm5
+ psadbw xmm6, xmm1
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm2
+ paddw xmm7, xmm4
+ paddw xmm7, xmm6
+
+ sub end_ptr, 1
+ jne .vp8_sad16x16_sse3_loop
+
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+ paddw xmm0, xmm7
+ movq rax, xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
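+    ; the X3 frame names are reused here, so ref_ptr/ref_stride actually
+    ; hold dst_ptr/dst_stride and 'height' names the fifth argument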
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+    ;Check to see if there are more rows that need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x4d_sse3) PRIVATE
+sym(vp8_sad16x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x4d_sse3) PRIVATE
+sym(vp8_sad16x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x16x4d_sse3) PRIVATE
+sym(vp8_sad8x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
+
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad8x8x4d_sse3) PRIVATE
+sym(vp8_sad8x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5
+ punpckldq mm6, mm7
+
+ movq [rcx], mm4
+ movq [rcx+8], mm6
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad4x4x4d_sse3) PRIVATE
+sym(vp8_sad4x4x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm1, DWORD PTR [r0_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm2
+ punpcklbw mm1, mm3
+
+ movd mm4, DWORD PTR [r1_ptr]
+ movd mm5, DWORD PTR [r2_ptr]
+
+ movd mm6, DWORD PTR [r3_ptr]
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+
+ movd mm3, DWORD PTR [r2_ptr+ref_stride]
+ movd mm7, DWORD PTR [r3_ptr+ref_stride]
+
+ psadbw mm1, mm0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea r0_ptr, [r0_ptr+ref_stride*2]
+
+ lea r1_ptr, [r1_ptr+ref_stride*2]
+ lea r2_ptr, [r2_ptr+ref_stride*2]
+
+ lea r3_ptr, [r3_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr]
+ movd mm2, DWORD PTR [r0_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm7, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm3
+ punpcklbw mm2, mm7
+
+ movd mm3, DWORD PTR [r1_ptr]
+ movd mm7, DWORD PTR [r2_ptr]
+
+ psadbw mm2, mm0
+%if ABI_IS_32BIT
+ mov rax, rbp
+
+ pop rbp
+%define ref_stride rax
+%endif
+ mov rsi, result_ptr
+
+ paddw mm1, mm2
+ movd [rsi], mm1
+
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+ movd mm1, DWORD PTR [r2_ptr+ref_stride]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, DWORD PTR [r3_ptr]
+ movd mm1, DWORD PTR [r3_ptr+ref_stride]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2
+
+
+ STACK_FRAME_DESTROY_X4
+
diff --git a/libvpx/vp8/common/x86/sad_sse4.asm b/libvpx/vp8/common/x86/sad_sse4.asm
new file mode 100644
index 0000000..f7fccd7
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
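+; Each mpsadbw produces eight word SADs of one 4-byte group of the source
+; against eight consecutive byte offsets in the reference; pairing the 0x0
+; and 0x5 selectors covers the low and high halves of an 8-byte block, so
+; their paddw sum is the full 8-byte SAD at offsets 0..7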
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
+global sym(vp8_sad16x16x8_sse4) PRIVATE
+sym(vp8_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad16x8x8_sse4) PRIVATE
+sym(vp8_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x8x8_sse4) PRIVATE
+sym(vp8_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad8x16x8_sse4) PRIVATE
+sym(vp8_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad4x4x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+global sym(vp8_sad4x4x8_sse4) PRIVATE
+sym(vp8_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_ssse3.asm b/libvpx/vp8/common/x86/sad_ssse3.asm
new file mode 100644
index 0000000..278fc06
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_ssse3.asm
@@ -0,0 +1,370 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro PROCESS_16X2X3 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
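+; When (ref & 15) is known, two aligned 16-byte loads plus palignr rebuild
+; the three candidate windows, avoiding the unaligned lddqu loads of the
+; generic path above; misalignment 15 would need a shift of 17 bytes, so
+; that case falls back to the lddqu version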
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+;void vp8_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_ssse3) PRIVATE
+sym(vp8_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
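+    ; rdx = ref_ptr & 15. The call/pop below materializes the current
+    ; address so the 32-bit offsets in the jump table can be converted
+    ; into an absolute branch target (position-independent dispatch on
+    ; the reference alignment)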
+ jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+ dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
+
+ call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
+
+.vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp8_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_ssse3) PRIVATE
+sym(vp8_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ mov rdx, 0xf
+ and rdx, rdi
+
+ jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+ dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
+
+ call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
+
+.vp8_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp8_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/subpixel_mmx.asm b/libvpx/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 0000000..47dd452
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,702 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+
+;void vp8_filter_block1d_h6_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
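+;
+; A rough C equivalent of one output pixel (a sketch only; f[] stands for one
+; six-entry row of vp8_six_tap_mmx below, and 64 is the rd rounding constant):
+;
+;   int sum = src[-2] * f[0] + src[-1] * f[1] + src[0] * f[2] +
+;             src[1]  * f[3] + src[2]  * f[4] + src[3] * f[5];
+;   output  = clamp((sum + 64) >> VP8_FILTER_SHIFT, 0, 255); /* stored as a word */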
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+
+ movq mm1, [rdx + 16] ; do both the negative taps first!!!
+ movq mm2, [rdx + 32] ;
+ movq mm6, [rdx + 48] ;
+ movq mm7, [rdx + 64] ;
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm4 = p2..p5
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ ; do outer positive taps
+ movd mm4, [rsi+3]
+ punpcklbw mm4, mm0 ; mm4 = p3..p6
+ pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ;
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1dc_v6_mmx
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
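+;
+; Second pass of the two-pass 6-tap filter: src_ptr points at the 16-bit
+; intermediate rows written by the horizontal pass, which are filtered
+; vertically, rounded, shifted and packed back to bytes. Roughly, as a
+; sketch with s[] the intermediate data and p its row stride:
+;
+;   int sum = s[-2*p] * f[0] + s[-p]  * f[1] + s[0]   * f[2] +
+;             s[p]    * f[3] + s[2*p] * f[4] + s[3*p] * f[5];
+;   output  = clamp((sum + 64) >> VP8_FILTER_SHIFT, 0, 255);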
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movq mm5, [GLOBAL(rd)]
+ push rbx
+ mov rbx, arg(7) ;vp8_filter
+ movq mm1, [rbx + 16] ; do both the negative taps first!!!
+ movq mm2, [rbx + 32] ;
+ movq mm6, [rbx + 48] ;
+ movq mm7, [rbx + 64] ;
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ sub rsi, rdx
+ sub rsi, rdx
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+
+.nextrow_cv:
+ movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
+ pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi] ; mm4 = p0..p3 = row -2
+ pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
+ pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ paddsw mm3, mm5 ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and saturate
+
+ movd [rdi],mm3 ; store the results in the destination
+ ; subsequent iterations repeat five of these six row reads. Since the
+ ; recon block should be in cache this shouldn't cost much, but it is
+ ; obviously avoidable.
+ lea rdi, [rdi+rax] ;
+ dec rcx ; decrement count
+ jnz .nextrow_cv ; next row
+
+ pop rbx
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
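+;
+; Bilinear prediction is two 2-tap passes, each rounded and shifted. As a
+; hedged C sketch (HFilter/VFilter are 2-entry rows of
+; vp8_bilinear_filters_x86_8; t[][] is the first-pass result that mm7
+; carries from one row to the next):
+;
+;   t[r][c]   = clamp((src[r][c] * HFilter[0] + src[r][c+1] * HFilter[1] + 64) >> 7, 0, 255);
+;   dst[r][c] = clamp((t[r][c] * VFilter[0] + t[r+1][c] * VFilter[1] + 64) >> 7, 0, 255);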
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
+sym(vp8_bilinear_predict8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ shl rax, 5 ; offset * 32
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+
+ shl rax, 5 ; offset*32
+ add rax, rcx ; VFilter
+
+ lea rcx, [rdi+rdx*8] ;
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x8:
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8 ;dst_pitch
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x8
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
+sym(vp8_bilinear_predict8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5
+
+ mov rsi, arg(0) ;src_ptr ;
+ add rax, rcx
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x4:
+ movq mm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; low bytes as words: 00 01 02 03
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+ cmp rdi, rcx ;
+ jne .next_row_8x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict4x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
+sym(vp8_bilinear_predict4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ;
+
+ movq mm2, [rax+16] ;
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5
+
+ add rax, rcx
+ lea rcx, [rdi+rdx*4] ;
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movd mm3, [rsi] ; 00 01 02 03
+ punpcklbw mm3, mm0 ; words: 00 01 02 03
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ movq mm7, mm3 ;
+ packuswb mm7, mm0 ;
+
+ add rsi, rdx ; next line
+.next_row_4x4:
+ movd mm3, [rsi] ; 00 01 02 03
+ punpcklbw mm3, mm0 ; words: 00 01 02 03
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+
+ movq mm5, mm7 ;
+ punpcklbw mm5, mm0 ;
+
+ pmullw mm5, [rax] ;
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ movq mm7, mm3 ;
+
+ packuswb mm7, mm0 ;
+
+ pmullw mm3, [rax+16] ;
+ paddw mm3, mm5 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ packuswb mm3, mm0
+ movd [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch ;
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx ;
+ jne .next_row_4x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 4 dw 0x40
+
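+; Eight sub-tables follow, one per 1/8-pel position. Each tap is replicated
+; across 8 words (16 bytes, matching the [rdx+16*k] tap addressing above),
+; and each position's six taps sum to 128, so the >> 7 normalizes the filter.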
+align 16
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
+sym(vp8_six_tap_mmx):
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 128
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 0
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw 0
+
+ times 8 dw 2
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+
+ times 8 dw 0
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw 0
+
+ times 8 dw 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+
+ times 8 dw 0
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw 0
+
+ times 8 dw 1
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+ times 8 dw 0
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+ times 8 dw 0
+
+
diff --git a/libvpx/vp8/common/x86/subpixel_sse2.asm b/libvpx/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 0000000..69f8d10
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,1372 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d8_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
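+; (This is the first pass of the two-pass 6-tap filter: it writes 16-bit
+; intermediates for vp8_filter_block1d8_v6_sse2 below, which is why
+; output_ptr is unsigned short rather than unsigned char.)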
+;void vp8_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 16 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm4
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+ punpcklbw xmm4, xmm0
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi]
+ pmullw xmm1, [rax]
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
+ pmullw xmm3, [rax + 32]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
+ pmullw xmm5, [rax + 64]
+
+ add rsi, rdx
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
+
+ pmullw xmm4, [rax + 48]
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
+
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+; unsigned short *src_ptr,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; const short *vp8_filter
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
+; input pixel array has output_height rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+sym(vp8_filter_block1d16_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx
+ sub rsi, rdx
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
+ pmullw xmm1, [rax + 16]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm3, [rax + 64]
+ pmullw xmm4, [rax + 64]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm5, [rax + 32]
+ pmullw xmm6, [rax + 32]
+
+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1
+ movdqa xmm0, XMMWORD PTR [rsi + 16]
+ pmullw xmm7, [rax]
+ pmullw xmm0, [rax]
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm0
+
+ add rsi, rdx
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm3, [rax + 48]
+ pmullw xmm4, [rax + 48]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm5, [rax + 80]
+ pmullw xmm6, [rax + 80]
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm7
+
+ psraw xmm1, 7
+ psraw xmm2, 7
+
+ packuswb xmm1, xmm2 ; pack and saturate
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; First-pass filter only when yoffset==0
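+; (With yoffset == 0 the vertical tap row is {0, 0, 128, 0, 0, 0}, making the
+; second pass an identity, so this variant runs the horizontal pass alone and
+; writes bytes directly instead of a 16-bit intermediate.)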
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0
+
+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination
+ lea rsi, [rsi + rax]
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_only_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; First-pass filter only when yoffset==0
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2]
+ movq xmm1, MMWORD PTR [rsi + 6]
+
+ movq xmm2, MMWORD PTR [rsi +14]
+ pslldq xmm2, 8
+
+ por xmm2, xmm1
+ prefetcht2 [rsi+rax-2]
+
+ pslldq xmm1, 8
+ por xmm1, xmm3
+
+ movdqa xmm4, xmm1
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm1
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; lower 8 bytes
+
+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
+
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm5
+
+ paddsw xmm4, xmm3
+ paddsw xmm4, xmm6
+
+ paddsw xmm4, xmm2
+ paddsw xmm4, [GLOBAL(rd)]
+
+ psraw xmm4, 7
+
+ packuswb xmm4, xmm0 ; higher 8 bytes
+
+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; int dst_pitch,
+; unsigned int output_height,
+; const short *vp8_filter
+;)
+; Second-pass filter only when xoffset==0
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ mov rax, arg(5) ;vp8_filter
+
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+ movq xmm1, MMWORD PTR [rsi]
+ movq xmm2, MMWORD PTR [rsi + rdx]
+ movq xmm3, MMWORD PTR [rsi + rdx * 2]
+ movq xmm5, MMWORD PTR [rsi + rdx * 4]
+ add rsi, rdx
+ movq xmm4, MMWORD PTR [rsi + rdx * 2]
+ movq xmm6, MMWORD PTR [rsi + rdx * 4]
+
+ punpcklbw xmm1, xmm0
+ pmullw xmm1, [rax]
+
+ punpcklbw xmm2, xmm0
+ pmullw xmm2, [rax + 16]
+
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax + 32]
+
+ punpcklbw xmm5, xmm0
+ pmullw xmm5, [rax + 64]
+
+ punpcklbw xmm4, xmm0
+ pmullw xmm4, [rax + 48]
+
+ punpcklbw xmm6, xmm0
+ pmullw xmm6, [rax + 80]
+
+ paddsw xmm2, xmm5
+ paddsw xmm2, xmm3
+
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+
+ paddsw xmm2, xmm6
+ paddsw xmm2, xmm7
+
+ psraw xmm2, 7
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int output_height,
+; unsigned int output_width
+;)
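+; Used on the xoffset == 0 path: no horizontal filtering is needed, so each
+; 16-pixel row is simply widened to 16-bit words for the vertical pass;
+; roughly, in C: output_ptr[c] = (unsigned short)src_ptr[c], c = 0..15.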
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+sym(vp8_unpack_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; 07 06 05 04 03 02 01 00
+ movq xmm3, MMWORD PTR [rsi+8] ; 0f 0e 0d 0c 0b 0a 09 08
+
+ punpcklbw xmm3, xmm0 ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+ punpcklbw xmm1, xmm0
+
+ movdqa XMMWORD Ptr [rdi], xmm1
+ movdqa XMMWORD Ptr [rdi + 16], xmm3
+
+ lea rsi, [rsi + rax]
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
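+;
+; A zero offset makes the corresponding 2-tap pass an identity, so the
+; prologue below dispatches to one of three paths. As a sketch:
+;
+;   if (xoffset == 0)      { /* .b16x16_sp_only: vertical pass only   */ }
+;   else if (yoffset == 0) { /* .b16x16_fp_only: horizontal pass only */ }
+;   else                   { /* full two-pass filter                  */ }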
+extern sym(vp8_bilinear_filters_x86_8)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
+sym(vp8_bilinear_predict16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ movsxd rax, dword ptr arg(2) ;xoffset
+
+ cmp rax, 0 ;skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ cmp rax, 0 ;skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+%endif
+ ; get the first horizontal line done
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ add rsi, rdx ; next line
+.next_row:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, [rax]
+ pmullw xmm6, [rax]
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4
+
+ pmullw xmm3, [rax+16]
+ pmullw xmm4, [rax+16]
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rdx ; next line
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ add rsi, rax ; next line
+.next_row_spo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+
+ movdqa xmm5, xmm7
+ movdqa xmm6, xmm7
+
+ movdqa xmm4, xmm3 ; make a copy of current line
+ movdqa xmm7, xmm3
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm5, xmm1
+ pmullw xmm6, xmm1
+ pmullw xmm3, xmm2
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ;dst_pitch
+ cmp rdi, rcx
+ jne .next_row_spo
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.next_row_fpo:
+ movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1]
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ; dst_pitch
+ cmp rdi, rcx
+ jne .next_row_fpo
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
+sym(vp8_bilinear_predict8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9 lines of unaligned data in and put them on the stack. This gives a
+ ;big performance boost.
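+ ;(Nine rows are needed because the vertical 2-tap pass uses the row below
+ ;each of the eight output rows.)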
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ shl rax, 5
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax]
+ movdqa xmm2, [rax+16]
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm5, [rax]
+ movdqa xmm6, [rax+16]
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqa xmm3, XMMWORD PTR [rsp]
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ add rsp, 16 ; next line
+.next_row8x8:
+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+ pmullw xmm7, xmm5
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm4, xmm3
+
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm7
+
+ movdqa xmm7, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb xmm3, xmm0
+ movq [rdi], xmm3 ; store the results in the destination
+
+ add rsp, 16 ; next line
+ add rdi, rdx
+
+ cmp rdi, rcx
+ jne .next_row8x8
+
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 8 dw 0x40
diff --git a/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libvpx/vp8/common/x86/subpixel_ssse3.asm
new file mode 100644
index 0000000..13bcaf6
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,1507 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d8_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
+; row per iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
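+;
+; The SSSE3 variants pair the taps so each pmaddubsw performs two
+; multiply-adds per pixel. As a rough sketch, where pair(a,b) denotes bytes
+; x[a] and x[b] interleaved:
+;
+;   sum  = pmaddubsw(pair(-2, 3), k0_k5);   /* x[-2]*k0 + x[3]*k5 */
+;   sum += pmaddubsw(pair( 0, 2), k2_k4);   /* x[ 0]*k2 + x[2]*k4 */
+;   sum += pmaddubsw(pair(-1, 1), k1_k3);   /* x[-1]*k1 + x[1]*k3 */
+;   out  = clamp((sum + 64) >> 7, 0, 255);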
+;void vp8_filter_block1d8_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4
+
+ movdqa xmm7, [GLOBAL(rd)]
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ mov rdi, arg(2) ;output_ptr
+
+ cmp esi, DWORD PTR [rax]
+ je vp8_filter_block1d8_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ pmaddubsw xmm1, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+ jnz .filter_block1d8_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+vp8_filter_block1d8_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3
+
+ pshufb xmm2, xmm4
+ pmaddubsw xmm0, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx
+
+ paddsw xmm0, xmm7
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+    movq MMWORD PTR [rdi], xmm0
+
+ jnz .filter_block1d8_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
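The `cmp esi, DWORD PTR [rax]` / `je` pair in the prolog above is a fast-path
dispatch: for the filter indices whose outer (k0, k5) tap pair is all zeros,
the routine branches to a 4-tap variant that skips one pmaddubsw per row. A
sketch of the test, with an illustrative helper name (the 16-byte-row table
layout is from the rodata at the end of this file):

    #include <stdint.h>

    /* Each filter index selects one 16-byte row of interleaved (k0, k5)
     * byte pairs; a zero leading dword means the outer taps are unused. */
    static int use_four_tap_path(const int8_t *k0_k5_table, int filter_index)
    {
        const int8_t *row = k0_k5_table + filter_index * 16;
        /* the asm compares the leading dword against zero */
        return row[0] == 0 && row[1] == 0 && row[2] == 0 && row[3] == 0;
    }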
+;void vp8_filter_block1d16_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ mov rdi, arg(2) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+ movq xmm3, MMWORD PTR [rsi + 6]
+
+ pmaddubsw xmm1, xmm5
+ movq xmm7, MMWORD PTR [rsi + 11]
+
+ pmaddubsw xmm2, xmm6
+ punpcklbw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)]
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0
+
+ lea rsi, [rsi + rax]
+ paddsw xmm3, xmm1
+
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)]
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3
+
+ punpcklqdq xmm0, xmm3
+
+    movdqa XMMWORD PTR [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d4_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ movdqa xmm7, [GLOBAL(rd)]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d4_h4_ssse3
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+ movdqu xmm0, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf1b)]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2b)]
+ pmaddubsw xmm0, xmm4
+ pshufb xmm2, [GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ pxor xmm1, xmm1
+ paddsw xmm0, xmm2
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ movd DWORD PTR [rdi], xmm0
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d4_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+ movdqu xmm1, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm1, xmm7
+ paddsw xmm1, xmm2
+ psraw xmm1, 7
+ packuswb xmm1, xmm1
+
+ movd DWORD PTR [rdi], xmm1
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d16_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+
+.vp8_filter_block1d16_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2 ;store the results
+
+ movq xmm1, MMWORD PTR [rsi + 8] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)]
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d16_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ paddsw xmm2, [GLOBAL(rd)]
+ paddsw xmm2, xmm3
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ punpcklbw xmm5, xmm4 ;B D
+ punpcklbw xmm1, xmm0 ;C E
+
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm5, xmm7
+
+ movdqa xmm4, [GLOBAL(rd)]
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm5, xmm1
+ paddsw xmm5, xmm4
+ psraw xmm5, 7
+ packuswb xmm5, xmm5
+
+ punpcklqdq xmm2, xmm5
+
+ movdqa XMMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d8_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d8_v4_ssse3
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+ movdqa xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d8_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm5, [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm5
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d4_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi
+ shl rdx, 4 ;
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax]
+ je .vp8_filter_block1d4_v4_ssse3
+
+ movq mm5, MMWORD PTR [rax] ;k0_k5
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+ movd mm1, DWORD PTR [rsi] ;A
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ movd mm0, DWORD PTR [rax + rdx * 4] ;F
+
+ movq mm4, [GLOBAL(rd)]
+
+ pmaddubsw mm3, mm6
+ punpcklbw mm1, mm0 ;A F
+ pmaddubsw mm2, mm7
+ pmaddubsw mm1, mm5
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm1
+ paddsw mm2, mm4
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+.vp8_filter_block1d4_v4_ssse3:
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+ movq mm5, MMWORD PTR [GLOBAL(rd)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ pmaddubsw mm3, mm6
+ pmaddubsw mm2, mm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm5
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
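The vertical variants below mirror the horizontal ones: six consecutive
source rows (labelled A..F in the comments) feed each output row, with the
same +64/>>7 rounding. A minimal C sketch, assuming the caller provides
height+5 valid input rows; the name and signature are illustrative:

    #include <stdint.h>

    static uint8_t clamp_u8v(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    static void filter_block1d_v6_ref(const uint8_t *src, unsigned src_pitch,
                                      uint8_t *dst, unsigned out_pitch,
                                      unsigned width, unsigned height,
                                      const int16_t k[6])
    {
        for (unsigned r = 0; r < height; ++r) {
            for (unsigned c = 0; c < width; ++c) {
                int sum = 64;                               /* rounding */
                for (int t = 0; t < 6; ++t)                 /* rows A..F */
                    sum += src[(r + t) * src_pitch + c] * k[t];
                dst[r * out_pitch + c] = clamp_u8v(sum >> 7);
            }
        }
    }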
+;void vp8_bilinear_predict16x16_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+sym(vp8_bilinear_predict16x16_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(2) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
+
+ movdqa xmm2, [rax]
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ; dst_pitch
+%endif
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
+
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm6, xmm1
+
+ punpcklbw xmm4, xmm5
+ pmaddubsw xmm4, xmm1
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ packuswb xmm6, xmm4
+ movdqa xmm5, xmm7
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm2
+
+ punpckhbw xmm7, xmm6
+ pmaddubsw xmm7, xmm2
+
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
+ psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm5, xmm7
+ movdqa xmm7, xmm6
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ; dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+ ; get the first horizontal line done
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
+
+ lea rsi, [rsi + rax] ; next line
+.next_row_sp:
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
+
+ punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
+
+ pmaddubsw xmm4, xmm1
+ movq xmm7, [rsi + rax] ; load row + 2
+
+ pmaddubsw xmm2, xmm1
+ movq xmm6, [rsi + rax + 8] ; load row + 2
+
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP8_FILTER_SHIFT
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm2
+ paddw xmm3, [GLOBAL(rd)]
+
+ movdqa [rdi], xmm4 ; store row 0
+ paddw xmm5, [GLOBAL(rd)]
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7
+
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax]
+
+ movdqa xmm2, xmm6
+ lea rdi, [rdi + 2*rdx]
+
+ cmp rdi, rcx
+ jne .next_row_sp
+
+ jmp .done
+
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8]
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+.next_row_fp:
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm2, xmm4
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ pmaddubsw xmm2, xmm1
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
+
+ pmaddubsw xmm3, xmm1
+ movq xmm5, [rsi]
+
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
+
+ paddw xmm3, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ punpcklbw xmm6, xmm7
+
+ packuswb xmm2, xmm3
+ pmaddubsw xmm6, xmm1
+
+ movdqa [rdi], xmm2 ; store the results in the destination
+ paddw xmm5, [GLOBAL(rd)]
+
+ lea rdi, [rdi + rdx] ; dst_pitch
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm6, VP8_FILTER_SHIFT
+
+ packuswb xmm5, xmm6
+ lea rsi, [rsi + rax] ; next line
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+ lea rdi, [rdi + rdx] ; dst_pitch
+
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
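vp8_bilinear_predict16x16_ssse3 above and the 8x8 variant below are two-pass
filters: an optional horizontal pass with weights (128-16*xoffset,
16*xoffset) from vp8_bilinear_filters_ssse3, then an optional vertical pass,
each rounded with +64 and shifted by 7; a pass is skipped when its offset is
zero. A hedged C sketch of the general case (the names and the 16x16 bound on
the scratch buffer are assumptions):

    #include <stdint.h>

    static void bilinear_predict_ref(const uint8_t *src, int src_stride,
                                     int xoffset, int yoffset,   /* 0..7 */
                                     uint8_t *dst, int dst_pitch,
                                     int w, int h)               /* <= 16 */
    {
        uint16_t tmp[17 * 16];                /* h+1 intermediate rows */
        int hf0 = 128 - 16 * xoffset, hf1 = 16 * xoffset;
        int vf0 = 128 - 16 * yoffset, vf1 = 16 * yoffset;

        for (int r = 0; r < h + 1; ++r)       /* first (horizontal) pass */
            for (int c = 0; c < w; ++c)
                tmp[r * w + c] = (src[r * src_stride + c] * hf0 +
                                  src[r * src_stride + c + 1] * hf1 + 64) >> 7;

        for (int r = 0; r < h; ++r)           /* second (vertical) pass */
            for (int c = 0; c < w; ++c)
                dst[r * dst_pitch + c] =
                    (uint8_t)((tmp[r * w + c] * vf0 +
                               tmp[(r + 1) * w + c] * vf1 + 64) >> 7);
    }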
+;void vp8_bilinear_predict8x8_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+sym(vp8_bilinear_predict8x8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+    ;Read 9 lines of unaligned data and store them on the stack. This gives
+    ;a big performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2]
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2]
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ; xoffset
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b8x8_sp_only
+
+ shl rax, 4
+ add rax, rcx ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b8x8_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8]
+
+ movdqa xmm1, [rax]
+
+ ; get the first horizontal line done
+ movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+ psrldq xmm5, 1
+ lea rsp, [rsp + 16] ; next line
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ lea rsp, [rsp + 16] ; next line
+
+ movdqa xmm5, xmm6
+
+ psrldq xmm5, 1
+
+ punpcklbw xmm6, xmm5
+ pmaddubsw xmm6, xmm0
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ packuswb xmm6, xmm6
+
+ punpcklbw xmm7, xmm6
+ pmaddubsw xmm7, xmm1
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm7, xmm7
+
+ movq [rdi], xmm7 ; store the results in the destination
+ lea rdi, [rdi + rdx]
+
+ movdqa xmm7, xmm6
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done8x8
+
+.b8x8_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; VFilter
+
+ movq xmm1, XMMWORD PTR [rsp]
+ movq xmm2, XMMWORD PTR [rsp+16]
+
+ movq xmm3, XMMWORD PTR [rsp+32]
+ punpcklbw xmm1, xmm2
+
+ movq xmm4, XMMWORD PTR [rsp+48]
+ punpcklbw xmm2, xmm3
+
+ movq xmm5, XMMWORD PTR [rsp+64]
+ punpcklbw xmm3, xmm4
+
+ movq xmm6, XMMWORD PTR [rsp+80]
+ punpcklbw xmm4, xmm5
+
+ movq xmm7, XMMWORD PTR [rsp+96]
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm1, xmm0
+ pmaddubsw xmm2, xmm0
+
+ pmaddubsw xmm3, xmm0
+ pmaddubsw xmm4, xmm0
+
+ pmaddubsw xmm5, xmm0
+ punpcklbw xmm6, xmm7
+
+ pmaddubsw xmm6, xmm0
+ paddw xmm1, [GLOBAL(rd)]
+
+ paddw xmm2, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ psraw xmm6, VP8_FILTER_SHIFT
+ packuswb xmm1, xmm1
+
+ packuswb xmm2, xmm2
+ movq [rdi], xmm1
+
+ packuswb xmm3, xmm3
+ movq [rdi+rdx], xmm2
+
+ packuswb xmm4, xmm4
+ movq xmm1, XMMWORD PTR [rsp+112]
+
+ lea rdi, [rdi + 2*rdx]
+ movq xmm2, XMMWORD PTR [rsp+128]
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm3
+
+ packuswb xmm6, xmm6
+ movq [rdi+rdx], xmm4
+
+ lea rdi, [rdi + 2*rdx]
+ punpcklbw xmm7, xmm1
+
+ movq [rdi], xmm5
+ pmaddubsw xmm7, xmm0
+
+ movq [rdi+rdx], xmm6
+ punpcklbw xmm1, xmm2
+
+ pmaddubsw xmm1, xmm0
+ paddw xmm7, [GLOBAL(rd)]
+
+ psraw xmm7, VP8_FILTER_SHIFT
+ paddw xmm1, [GLOBAL(rd)]
+
+ psraw xmm1, VP8_FILTER_SHIFT
+ packuswb xmm7, xmm7
+
+ packuswb xmm1, xmm1
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm7
+
+ movq [rdi+rdx], xmm1
+ lea rsp, [rsp + 144]
+
+ jmp .done8x8
+
+.b8x8_fp_only:
+ lea rcx, [rdi+rdx*8]
+
+.next_row_fp:
+ movdqa xmm1, XMMWORD PTR [rsp]
+ movdqa xmm3, XMMWORD PTR [rsp+16]
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, XMMWORD PTR [rsp+32]
+
+ psrldq xmm2, 1
+ movdqa xmm7, XMMWORD PTR [rsp+48]
+
+ movdqa xmm4, xmm3
+ psrldq xmm4, 1
+
+ movdqa xmm6, xmm5
+ psrldq xmm6, 1
+
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xmm0
+
+ punpcklbw xmm3, xmm4
+ pmaddubsw xmm3, xmm0
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm0
+
+ movdqa xmm2, xmm7
+ psrldq xmm2, 1
+
+ punpcklbw xmm7, xmm2
+ pmaddubsw xmm7, xmm0
+
+ paddw xmm1, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm7, [GLOBAL(rd)]
+ psraw xmm7, VP8_FILTER_SHIFT
+
+ packuswb xmm1, xmm1
+ packuswb xmm3, xmm3
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm1
+
+ packuswb xmm7, xmm7
+ movq [rdi+rdx], xmm3
+
+ lea rdi, [rdi + 2*rdx]
+ movq [rdi], xmm5
+
+ lea rsp, [rsp + 4*16]
+ movq [rdi+rdx], xmm7
+
+ lea rdi, [rdi + 2*rdx]
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+ lea rsp, [rsp + 16]
+
+.done8x8:
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+shuf1b:
+ db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+ db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+ db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+align 16
+rd:
+ times 8 dw 0x40
+
+align 16
+k0_k5:
+ times 8 db 0, 0 ;placeholder
+ times 8 db 0, 0
+ times 8 db 2, 1
+ times 8 db 0, 0
+ times 8 db 3, 3
+ times 8 db 0, 0
+ times 8 db 1, 2
+ times 8 db 0, 0
+k1_k3:
+ times 8 db 0, 0 ;placeholder
+ times 8 db -6, 12
+ times 8 db -11, 36
+ times 8 db -9, 50
+ times 8 db -16, 77
+ times 8 db -6, 93
+ times 8 db -8, 108
+ times 8 db -1, 123
+k2_k4:
+ times 8 db 128, 0 ;placeholder
+ times 8 db 123, -1
+ times 8 db 108, -8
+ times 8 db 93, -6
+ times 8 db 77, -16
+ times 8 db 50, -9
+ times 8 db 36, -11
+ times 8 db 12, -6
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
+
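A note on the coefficient tables above: each 6-tap filter index selects one
16-byte row from each of the three tables (k0_k5, k1_k3, k2_k4, 128 bytes
apart), holding interleaved signed byte pairs, so a single pmaddubsw
multiplies unsigned pixels against signed coefficients and adds adjacent
products into saturated 16-bit lanes. A sketch of one lane (the helper name
is illustrative):

    #include <stdint.h>

    /* pmaddubsw: u8 * s8 + u8 * s8 -> saturated s16, per lane */
    static int16_t pmaddubsw_lane(uint8_t p0, uint8_t p1, int8_t c0, int8_t c1)
    {
        int v = p0 * c0 + p1 * c1;
        if (v > 32767)  v = 32767;
        if (v < -32768) v = -32768;
        return (int16_t)v;
    }

For example, pmaddubsw_lane(pix_a, pix_b, -11, 36) reproduces one lane of the
k1_k3 multiply for filter index 2.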
diff --git a/libvpx/vp8/common/x86/variance_impl_mmx.asm b/libvpx/vp8/common/x86/variance_impl_mmx.asm
new file mode 100644
index 0000000..d9120d0
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_mmx.asm
@@ -0,0 +1,851 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+global sym(vp8_get_mb_ss_mmx) PRIVATE
+sym(vp8_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16
+ pxor mm4, mm4
+
+.NEXTROW:
+ movq mm0, [rax]
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ja .NEXTROW
+ movq QWORD PTR [rsp], mm4
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
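What the loop above accumulates, as a C reference: the sum of squares of a
macroblock's 256 transform coefficients (16 iterations of 16 shorts). The
name is illustrative:

    #include <stdint.h>

    static unsigned int get_mb_ss_ref(const int16_t *src)
    {
        unsigned int ss = 0;
        for (int i = 0; i < 256; ++i)
            ss += (unsigned int)(src[i] * src[i]);
        return ss;
    }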
+
+;unsigned int vp8_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get8x8var_mmx) PRIVATE
+sym(vp8_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor mm5, mm5 ; Blank mm5 (difference accumulator)
+        pxor mm6, mm6 ; Blank mm6 (zero for unpacking)
+        pxor mm7, mm7 ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+        punpckhbw mm2, mm6 ; unpack to higher precision
+        punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
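A C reference for the unrolled 8-row loop above: the routine returns 0 and
reports its results through the SSE and Sum pointers, from which callers
derive the block variance. The name is illustrative:

    #include <stdint.h>

    static unsigned int get8x8var_ref(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *SSE, int *Sum)
    {
        int sum = 0;
        unsigned int sse = 0;
        for (int r = 0; r < 8; ++r) {
            for (int c = 0; c < 8; ++c) {
                int d = src[c] - ref[c];   /* A-B, as in the comments */
                sum += d;
                sse += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *SSE = sse;
        *Sum = sum;
        return 0;
    }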
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+global sym(vp8_get4x4var_mmx) PRIVATE
+sym(vp8_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16
+ ; end prolog
+
+
+        pxor mm5, mm5 ; Blank mm5 (difference accumulator)
+        pxor mm6, mm6 ; Blank mm6 (zero for unpacking)
+        pxor mm7, mm7 ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8]
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp]
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax
+ mov dword ptr [rdi], edx
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx) PRIVATE
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+        pxor mm6, mm6 ; Blank mm6 (zero for unpacking)
+        pxor mm7, mm7 ; Blank mm7 (SSE accumulator)
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+        movd mm0, [rax] ; Copy four bytes to mm0
+        movd mm1, [rbx] ; Copy four bytes to mm1
+        punpcklbw mm0, mm6 ; unpack to higher precision
+        punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+        movd mm0, [rax] ; Copy four bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+        movd mm0, [rax] ; Copy four bytes to mm0
+        punpcklbw mm1, mm6
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+        movd mm1, [rbx] ; Copy four bytes to mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+        movd mm0, [rax] ; Copy four bytes to mm0
+        punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ;
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movq rax, mm0
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ;
+ pxor mm0, mm0 ;
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ;
+
+ movq mm5, mm1 ;
+ pmullw mm3, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ;
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ;
+ paddw mm6, mm1 ;
+
+ pmaddwd mm1, mm1 ;
+ paddd mm7, mm1 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
+sym(vp8_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ pxor mm6, mm6 ;
+ pxor mm7, mm7 ;
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor mm0, mm0 ;
+ movq mm1, [rsi] ;
+
+ movq mm3, [rsi+1] ;
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ;
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ;
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ pmullw mm3, [rdx] ;
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ;
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ;
+ psubw mm2, mm4 ;
+
+ paddw mm6, mm1 ;
+ pmaddwd mm1, mm1 ;
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ;
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
+
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ;
+ movd dword ptr [rsi], mm4 ;
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
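Both bilinear-variance routines above share the same structure: a horizontal
bilinear pass produces the next filtered row, a vertical pass combines it
with the previous filtered row, and the result is differenced against the
source block to accumulate the sum and the sum of squares. A hedged C sketch
at width 8 (the names and two-entry filter arrays are assumptions; the asm
replicates each weight across a whole register):

    #include <stdint.h>

    static void bil_var_ref(const uint8_t *ref, int ref_stride,
                            const uint8_t *src, int src_stride,
                            unsigned height,
                            const uint16_t hf[2], const uint16_t vf[2],
                            int *sum, unsigned int *sse)
    {
        uint16_t prev[8], cur[8];
        int s = 0;
        unsigned int ss = 0;

        for (unsigned r = 0; r <= height; ++r) {
            for (int c = 0; c < 8; ++c)   /* first pass, +64 >> 7 */
                cur[c] = (ref[c] * hf[0] + ref[c + 1] * hf[1] + 64) >> 7;
            if (r > 0) {                  /* second pass + accumulation */
                for (int c = 0; c < 8; ++c) {
                    int p = (prev[c] * vf[0] + cur[c] * vf[1] + 64) >> 7;
                    int d = p - src[c];
                    s += d;
                    ss += (unsigned int)(d * d);
                }
                src += src_stride;
            }
            for (int c = 0; c < 8; ++c)
                prev[c] = cur[c];
            ref += ref_stride;
        }
        *sum = s;
        *sse = ss;
    }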
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+ times 4 dw 64
diff --git a/libvpx/vp8/common/x86/variance_impl_sse2.asm b/libvpx/vp8/common/x86/variance_impl_sse2.asm
new file mode 100644
index 0000000..761433c
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_sse2.asm
@@ -0,0 +1,1359 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
+global sym(vp8_get_mb_ss_sse2) PRIVATE
+sym(vp8_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8
+ pxor xmm4, xmm4
+
+.NEXTROW:
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ja .NEXTROW
+
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movq rax,xmm4
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get16x16var_sse2) PRIVATE
+sym(vp8_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ ; Prefetch data
+ lea rcx, [rax+rax*2]
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2]
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ prefetcht0 [rsi+rax*8]
+ prefetcht0 [rdi+rdx*8]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm4, xmm0
+
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1
+ pmaddwd xmm1, xmm1
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz .var16loop
+
+
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7
+ movd DWORD PTR [rdi], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
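The (Sum, SSE) pair written above is combined by callers into the usual
variance for a w*h block, variance = SSE - Sum^2/(w*h); for 16x16 the divisor
is a shift by 8. A one-line sketch (the helper name is illustrative):

    #include <stdint.h>

    static unsigned int variance_from_sums_16x16(unsigned int sse, int sum)
    {
        return sse - (unsigned int)(((int64_t)sum * sum) >> 8);
    }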
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp8_get8x8var_sse2) PRIVATE
+sym(vp8_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2
+ paddw xmm7, xmm1
+
+ pmaddwd xmm1, xmm1
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6
+ paddd xmm1, xmm2
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movq rdx, xmm7
+ movsx rcx, dx
+
+ mov dword ptr [rax], ecx
+ movd DWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
+sym(vp8_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ pxor xmm6, xmm6 ;
+ pxor xmm7, xmm7 ;
+
+ lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
+ movdqa xmm4, XMMWORD PTR [rsi]
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_sse2_sp_only
+
+ shl rax, 5 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_sse2_fp_only
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm5, xmm1
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm3, xmm5 ;
+ movdqa xmm5, xmm1 ;
+
+ pmullw xmm3, [rdx] ;
+ pmullw xmm1, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movq xmm3, QWORD PTR [rsi] ;
+ punpcklbw xmm3, xmm0 ;
+ movdqa xmm5, xmm3
+
+ pmullw xmm1, [rdx] ;
+ pmullw xmm3, [rdx+16] ;
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ movdqa xmm1, xmm5 ;
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_sp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_fp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+%else
+ add rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance8x_h_1:
+
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm2, QWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ movdqu xmm3, XMMWORD PTR [rsi+1]
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+ lea rsi, [rsi + rax]
+
+vp8_half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ;
+ movdqu xmm2, XMMWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm5 = vertical average of the above
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm4, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+
+ movq xmm3, QWORD PTR [rdi+8]
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
+sym(vp8_half_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+vp8_half_vert_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s0,s1,s2..s8 of the next row
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ lea rsi, [rsi + rax ]
+ pxor xmm0, xmm0
+
+vp8_half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi]
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm4, xmm0
+
+ movq xmm2, QWORD PTR [rdi]
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1
+ jnz vp8_half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
+sym(vp8_half_horiz_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ pxor xmm0, xmm0 ;
+vp8_half_horiz_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ;
+ movdq2q mm7, xmm7 ;
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6
+ movdq2q mm3, xmm7
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ;
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ;
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ;
+ movd [rdi], mm4 ;
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse accumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+vp8_half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm1, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ psubw xmm1, xmm2
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
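Each dw line above holds one eighth-pel offset's tap pair, each weight replicated eight times; the pair always sums to 128, and xmm_bi_rd adds 64 before the shift by 7 (xmm_filter_shift) so the result rounds to nearest. A scalar sketch of one tap (bilinear_px is a hypothetical name for illustration, not a libvpx function):

    /* One bilinear tap: w0 + w1 == 128, add 64, shift by 7 (Q7 rounding). */
    static unsigned char bilinear_px(unsigned char a, unsigned char b,
                                     int w0, int w1)
    {
        return (unsigned char)((a * w0 + b * w1 + 64) >> 7);
    }
    /* offset 2 uses (96, 32): bilinear_px(10, 20, 96, 32) == 13 */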
diff --git a/libvpx/vp8/common/x86/variance_impl_ssse3.asm b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
new file mode 100644
index 0000000..686b4a9
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
@@ -0,0 +1,364 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second operand
+;of pmaddubsw is signed bytes, we must calculate the zero offset separately.
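In concrete terms: pmaddubsw multiplies unsigned bytes from its first operand by signed bytes (-128..127) from its second, so the offset-0 coefficient pair (128, 0) cannot live in the byte table below; a stored 128 reads back as -128. A minimal C illustration of the wrap (assumes a two's-complement target; not libvpx code):

    /* Why offset 0 takes the full-pixel path instead of the byte table:
     * 128 does not fit in pmaddubsw's signed-byte coefficient operand. */
    signed char w = (signed char)128;  /* wraps to -128 */
    int bad  = 255 * w;                /* -32640 */
    int good = 255 * 128;              /*  32640, the intended product */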
+global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
+sym(vp8_filter_block2d_bil_var_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .filter_block2d_bil_var_ssse3_sp_only
+
+ shl rax, 4 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je .filter_block2d_bil_var_ssse3_fp_only
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi+1]
+ movdqa xmm2, xmm0
+
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, [rax]
+ pmaddubsw xmm2, [rax]
+
+ paddw xmm0, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm0, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ packuswb xmm0, xmm2
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + r8]
+%endif
+
+.filter_block2d_bil_var_ssse3_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+ packuswb xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ movdqa xmm0, xmm1
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm2, xmm1
+ punpckhbw xmm3, xmm1
+ pmaddubsw xmm2, [rdx]
+ pmaddubsw xmm3, [rdx]
+
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm2, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm1, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm1, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm2, xmm1
+ psubw xmm3, xmm5
+ paddw xmm6, xmm2
+ paddw xmm6, xmm3
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm2
+ paddd xmm7, xmm3
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rsi, [rsi + r8]
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_var_ssse3_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je .filter_block2d_bil_var_ssse3_full_pixel
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqa xmm0, xmm1
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ lea rsi, [rsi + rax]
+
+.filter_block2d_bil_sp_only_loop:
+ movdqu xmm3, XMMWORD PTR [rsi]
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+
+ punpcklbw xmm1, xmm3
+ punpckhbw xmm2, xmm3
+ pmaddubsw xmm1, [rdx]
+ pmaddubsw xmm2, [rdx]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ movq xmm3, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ movdqa xmm1, xmm0
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_sp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ movq xmm2, QWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+
+ movq xmm3, QWORD PTR [rdi]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [rdi+8]
+ punpcklbw xmm4, xmm0
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm4
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rdx] ;src_pixels_per_line
+ sub rcx, 1
+ jnz .filter_block2d_bil_full_pixel_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+.filter_block2d_bil_fp_only_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm2, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm2, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm2
+ psubw xmm3, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm3
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm1
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rdx]
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_fp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_variance:
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(7) ;[Sum]
+ mov rdi, arg(8) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
diff --git a/libvpx/vp8/common/x86/variance_mmx.c b/libvpx/vp8/common/x86/variance_mmx.c
new file mode 100644
index 0000000..0c4dd4a
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_mmx.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
+
+extern void filter_block1d_h6_mmx
+(
+ const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *filter
+);
+extern void filter_block1d_v6_mmx
+(
+ const short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+
+unsigned int vp8_variance4x4_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4));
+
+}
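This wrapper and the ones that follow all apply the identity variance = SSE - Sum^2 / N, where N is the block's pixel count; the shifts >> 4, >> 6, >> 7 and >> 8 are log2(N) for 16, 64, 128 and 256 pixels. A sketch of the shared tail (variance_from_sums is a hypothetical name, not a libvpx function):

    /* N is a power of two (16/64/128/256), so Sum*Sum/N is a right shift. */
    static unsigned int variance_from_sums(unsigned int sse, int sum,
                                           int log2_n)
    {
        return sse - ((unsigned int)(sum * sum) >> log2_n);
    }
    /* vp8_variance4x4_mmx above is variance_from_sums(var, avg, 4). */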
+
+unsigned int vp8_variance8x8_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+
+ return (var - ((unsigned int)(avg * avg) >> 6));
+
+}
+
+unsigned int vp8_mse16x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ *sse = var;
+ return var;
+}
+
+
+unsigned int vp8_variance16x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, sse2, sse3, var;
+ int sum0, sum1, sum2, sum3, avg;
+
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+ var = sse0 + sse1 + sse2 + sse3;
+ avg = sum0 + sum1 + sum2 + sum3;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
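Note that the wrapper above discards the variance return value and returns *sse unchanged: MSE is the raw sum of squared differences, i.e. the variance before mean correction. For a 16x16 block the two are related by SSE = variance + ((unsigned int)(Sum * Sum) >> 8).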
+
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum0, &xxsum0
+ );
+
+
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum1, &xxsum1
+ );
+
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
+ ref_ptr, recon_stride, sse);
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
+ ref_ptr, recon_stride, sse);
+}
diff --git a/libvpx/vp8/common/x86/variance_sse2.c b/libvpx/vp8/common/x86/variance_sse2.c
new file mode 100644
index 0000000..afd6429
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_sse2.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
+
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+extern unsigned int vp8_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+
+unsigned int vp8_get_mb_ss_sse2
+(
+ const short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+unsigned int vp8_get8x8var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+void vp8_filter_block2d_bil_var_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp8_variance4x4_wmt(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4));
+
+}
+
+unsigned int vp8_variance8x8_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int var;
+ int avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 6));
+
+}
+
+
+unsigned int vp8_variance16x16_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0;
+ int sum0;
+
+
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - ((unsigned int)(sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+
+ unsigned int sse0;
+ int sum0;
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse)
+{
+ unsigned int sse0, sse1, var;
+ int sum0, sum1, avg;
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7));
+
+}
+
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6));
+}
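The xoffset/yoffset values index the eighth-pel filter tables (0..7), so 4 is exactly half a pixel; those three branches can use the pavgb-based helpers instead of the general bilinear filter. What pavgb computes per byte, as a scalar sketch (illustrative only):

    /* pavgb: average with round-up -- the exact half-pel case needs no
     * multiply, which is why offset 4 gets its own helpers above. */
    static unsigned char pavgb_scalar(unsigned char a, unsigned char b)
    {
        return (unsigned char)((a + b + 1) >> 1);
    }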
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+
+ /* Note: these if statements could be avoided if the calling function
+ * invoked the appropriate variant directly.
+ */
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0
+ );
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum1, &xxsum1
+ );
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_mse16x16_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0, xsum1;
+ unsigned int xxsum0, xxsum1;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum1, &xxsum1);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum;
+ unsigned int xxsum;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0;
+ unsigned int xxsum0;
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
diff --git a/libvpx/vp8/common/x86/variance_ssse3.c b/libvpx/vp8/common/x86/variance_ssse3.c
new file mode 100644
index 0000000..ba2055c
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_ssse3.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ /* Note: these if statements could be avoided if the calling function
+ * invoked the appropriate variant directly.
+ */
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_variance16x8_ssse3
+(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse
+
+)
+{
+ int xsum0;
+ unsigned int xxsum0;
+
+ if (xoffset == 4 && yoffset == 0)
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4)
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4)
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7));
+}
diff --git a/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libvpx/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 0000000..3437a23
--- /dev/null
+++ b/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "filter_x86.h"
+
+extern const short vp8_six_tap_mmx[8][6*8];
+
+extern void vp8_filter_block1d_h6_mmx
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1dc_v6_mmx
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int output_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_v6_sse2
+(
+ unsigned short *src_ptr,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_filter_block1d8_h6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_only_sse2
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ int dst_pitch,
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
+
+}
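The constants follow from the 6-tap kernel, which reads two rows above and three below each output row: the horizontal pass starts at src_ptr - 2 * src_pixels_per_line and produces height + 5 rows of 16-bit intermediates (9 here; 21 and 13 in the 16x16 and 8x8 variants that follow). A sketch of that bookkeeping (sixtap_tmp_rows is a hypothetical name):

    /* Intermediate rows needed by the two-pass 6-tap filter: the vertical
     * pass consumes (taps - 1) extra rows of context. */
    enum { SIXTAP_TAPS = 6 };
    static int sixtap_tmp_rows(int height)
    {
        return height + SIXTAP_TAPS - 1;  /* 4 -> 9, 8 -> 13, 16 -> 21 */
    }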
+
+
+void vp8_sixtap_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16, 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16, 16, 16, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16, 16, 16, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x8_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, 8, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 8, 8, VFilter);
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, 8, VFilter);
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8, 4, 8, VFilter);
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
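+ /* Build the 16x16 bilinear prediction from four independent 8x8
+ * quadrant calls. The 2-tap bilinear filter only reaches down and to
+ * the right within the contiguous source, so splitting the block is
+ * exact. */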
+ vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch);
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
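+ /* Pick the cheapest path for the given sub-pel offsets: both passes,
+ * horizontal only (writing straight to dst), or vertical only (after
+ * widening the source to 16 bits). */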
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
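+ /* The unpack step only widens the 8-bit source rows into the 16-bit
+ * intermediate format the vertical filter consumes; no horizontal
+ * filtering is applied. */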
+ vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, dst_pitch, VFilter);
+ }
+}
+
+
+void vp8_sixtap_predict8x8_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
+ }
+}
+
+
+void vp8_sixtap_predict8x4_sse2
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, dst_pitch, VFilter);
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
+ }
+ }
+ else
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
+ }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
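+/* Unlike the MMX/SSE2 routines above, the SSSE3 routines take a filter
+ index rather than a coefficient pointer and resolve the filter taps
+ internally. */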
+extern void vp8_filter_block1d8_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d16_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d8_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_h6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr,
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+extern void vp8_filter_block1d4_v6_ssse3
+(
+ unsigned char *src_ptr,
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index
+);
+
+void vp8_sixtap_predict16x16_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 16, 21, xoffset);
+ vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch,
+ 16, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 13, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 8, yoffset);
+ }
+ else
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+
+void vp8_sixtap_predict8x4_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 9, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+ }
+ }
+}
+
+void vp8_sixtap_predict4x4_ssse3
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
+
+ if (xoffset)
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ FData2, 4, 9, xoffset);
+ vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* The SSSE3 second-pass-only function couldn't handle the
+ * (xoffset==0 && yoffset==0) case correctly. Use a copy here to
+ * guarantee the six-tap path handles all possible offsets. */
+ int r;
+
+ for (r = 0; r < 4; r++)
+ {
+ #if !(CONFIG_FAST_UNALIGNED)
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ #else
+ *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr;
+ #endif
+ dst_ptr += dst_pitch;
+ src_ptr += src_pixels_per_line;
+ }
+ }
+ }
+}
+
+#endif
diff --git a/libvpx/vp8/decoder/asm_dec_offsets.c b/libvpx/vp8/decoder/asm_dec_offsets.c
new file mode 100644
index 0000000..842a0d5
--- /dev/null
+++ b/libvpx/vp8/decoder/asm_dec_offsets.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "onyxd_int.h"
+
+BEGIN
+
+DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range));
+
+END
+
+/* add asserts for any offset that is not supported by assembly code */
+/* add asserts for any size that is not supported by assembly code */
diff --git a/libvpx/vp8/decoder/dboolhuff.c b/libvpx/vp8/decoder/dboolhuff.c
new file mode 100644
index 0000000..7e7b05a
--- /dev/null
+++ b/libvpx/vp8/decoder/dboolhuff.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "dboolhuff.h"
+#include "vpx_ports/mem.h"
+#include "vpx_mem/vpx_mem.h"
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz)
+{
+ br->user_buffer_end = source+source_sz;
+ br->user_buffer = source;
+ br->value = 0;
+ br->count = -8;
+ br->range = 255;
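+ /* The bool decoder starts with range == 255 per the VP8 bitstream; a
+ * count of -8 marks the value register as empty so that the fill below
+ * loads the initial bytes. */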
+
+ if (source_sz && !source)
+ return 1;
+
+ /* Populate the buffer */
+ vp8dx_bool_decoder_fill(br);
+
+ return 0;
+}
+
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
+{
+ const unsigned char *bufptr;
+ const unsigned char *bufend;
+ VP8_BD_VALUE value;
+ int count;
+ bufend = br->user_buffer_end;
+ bufptr = br->user_buffer;
+ value = br->value;
+ count = br->count;
+
+ VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+
+ br->user_buffer = bufptr;
+ br->value = value;
+ br->count = count;
+}
diff --git a/libvpx/vp8/decoder/dboolhuff.h b/libvpx/vp8/decoder/dboolhuff.h
new file mode 100644
index 0000000..1a08c05
--- /dev/null
+++ b/libvpx/vp8/decoder/dboolhuff.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DBOOLHUFF_H
+#define DBOOLHUFF_H
+#include <stddef.h>
+#include <limits.h>
+#include "vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+typedef size_t VP8_BD_VALUE;
+
+# define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT)
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+# define VP8_LOTS_OF_BITS (0x40000000)
+
+typedef struct
+{
+ const unsigned char *user_buffer_end;
+ const unsigned char *user_buffer;
+ VP8_BD_VALUE value;
+ int count;
+ unsigned int range;
+} BOOL_DECODER;
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
+int vp8dx_start_decode(BOOL_DECODER *br,
+ const unsigned char *source,
+ unsigned int source_sz);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
+
+/*The refill loop is used in several places, so define it in a macro to make
+ sure they're all consistent.
+ An inline function would be cleaner, but has a significant penalty, because
+ multiple BOOL_DECODER fields must be modified, and the compiler is not smart
+ enough to eliminate the stores to those fields and the subsequent reloads
+ from them when inlining the function.*/
+#define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \
+ do \
+ { \
+ int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \
+ int loop_end, x; \
+ size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
+ \
+ x = (int)(shift + CHAR_BIT - bits_left); \
+ loop_end = 0; \
+ if(x >= 0) \
+ { \
+ (_count) += VP8_LOTS_OF_BITS; \
+ loop_end = x; \
+ if(!bits_left) break; \
+ } \
+ while(shift >= loop_end) \
+ { \
+ (_count) += CHAR_BIT; \
+ (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \
+ shift -= CHAR_BIT; \
+ } \
+ } \
+ while(0) \
+
+
+static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+ unsigned int bit = 0;
+ VP8_BD_VALUE value;
+ unsigned int split;
+ VP8_BD_VALUE bigsplit;
+ int count;
+ unsigned int range;
+
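+ /* Standard VP8 boolean (arithmetic) decode: split the current range
+ * into [0, split) for a zero and [split, range) for a one, with split
+ * proportional to the probability of a zero (out of 256). */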
+ split = 1 + (((br->range - 1) * probability) >> 8);
+
+ if(br->count < 0)
+ vp8dx_bool_decoder_fill(br);
+
+ value = br->value;
+ count = br->count;
+
+ bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+
+ range = split;
+
+ if (value >= bigsplit)
+ {
+ range = br->range - split;
+ value = value - bigsplit;
+ bit = 1;
+ }
+
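+ /* Renormalize: vp8_norm[range] is the left shift that brings range
+ * back into [128, 255]; value shifts along with it and count pays for
+ * the bits consumed. */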
+ {
+ register unsigned int shift = vp8_norm[range];
+ range <<= shift;
+ value <<= shift;
+ count -= shift;
+ }
+ br->value = value;
+ br->count = count;
+ br->range = range;
+
+ return bit;
+}
+
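+/* Read a raw unsigned `bits`-bit literal, MSB first, with each bit coded
+ at probability one half (0x80). */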
+static int vp8_decode_value(BOOL_DECODER *br, int bits)
+{
+ int z = 0;
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ {
+ z |= (vp8dx_decode_bool(br, 0x80) << bit);
+ }
+
+ return z;
+}
+
+static int vp8dx_bool_error(BOOL_DECODER *br)
+{
+ /* Check if we have reached the end of the buffer.
+ *
+ * Variable 'count' stores the number of bits in the 'value' buffer, minus
+ * 8. The top byte is part of the algorithm, and the remainder is buffered
+ * to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+ * occupied, 8 for the algorithm and 8 in the buffer.
+ *
+ * When reading a byte from the user's buffer, count is incremented by 8
+ * and one byte is shifted into the value buffer. When we reach the end
+ * of the data, VP8_LOTS_OF_BITS is additionally added to count. A count
+ * above VP8_BD_VALUE_SIZE but below VP8_LOTS_OF_BITS therefore means
+ * bits have been consumed past the end of the user's data.
+ */
+ if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS))
+ {
+ /* We have tried to decode bits after the end of
+ * stream was encountered.
+ */
+ return 1;
+ }
+
+ /* No error. */
+ return 0;
+}
+#endif
diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c
new file mode 100644
index 0000000..8027a07
--- /dev/null
+++ b/libvpx/vp8/decoder/decodemv.c
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treereader.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropymode.h"
+#include "onyxd_int.h"
+#include "vp8/common/findnearmv.h"
+
+#if CONFIG_DEBUG
+#include <assert.h>
+#endif
+static B_PREDICTION_MODE read_bmode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
+
+ return (B_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_kf_ymode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static MB_PREDICTION_MODE read_uv_mode(vp8_reader *bc, const vp8_prob *p)
+{
+ const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
+
+ return (MB_PREDICTION_MODE)i;
+}
+
+static void read_kf_modes(VP8D_COMP *pbi, MODE_INFO *mi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ const int mis = pbi->common.mode_info_stride;
+
+ mi->mbmi.ref_frame = INTRA_FRAME;
+ mi->mbmi.mode = read_kf_ymode(bc, vp8_kf_ymode_prob);
+
+ if (mi->mbmi.mode == B_PRED)
+ {
+ int i = 0;
+ mi->mbmi.is_4x4 = 1;
+
+ do
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mi, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mi, i);
+
+ mi->bmi[i].as_mode =
+ read_bmode(bc, vp8_kf_bmode_prob [A] [L]);
+ }
+ while (++i < 16);
+ }
+
+ mi->mbmi.uv_mode = read_uv_mode(bc, vp8_kf_uv_mode_prob);
+}
+
+static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc)
+{
+ const vp8_prob *const p = (const vp8_prob *) mvc;
+ int x = 0;
+
+ if (vp8_read(r, p [mvpis_short])) /* Large */
+ {
+ int i = 0;
+
+ do
+ {
+ x += vp8_read(r, p [MVPbits + i]) << i;
+ }
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ {
+ x += vp8_read(r, p [MVPbits + i]) << i;
+ }
+ while (--i > 3);
+
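+ /* Bit 3 is implicit when bits 4..9 are all zero: any value below 8
+ * would have used the short tree instead, so x must be at least 8. */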
+ if (!(x & 0xFFF0) || vp8_read(r, p [MVPbits + 3]))
+ x += 8;
+ }
+ else /* small */
+ x = vp8_treed_read(r, vp8_small_mvtree, p + MVPshort);
+
+ if (x && vp8_read(r, p [MVPsign]))
+ x = -x;
+
+ return x;
+}
+
+static void read_mv(vp8_reader *r, MV *mv, const MV_CONTEXT *mvc)
+{
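+ /* Components are coded at quarter-pel precision; the left shift puts
+ * them in the 1/8-pel units used for clamping and prediction (chroma
+ * MVs, derived by averaging, may land on odd 1/8-pel positions). */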
+ mv->row = (short)(read_mvcomponent(r, mvc) << 1);
+ mv->col = (short)(read_mvcomponent(r, ++mvc) << 1);
+}
+
+
+static void read_mvcontexts(vp8_reader *bc, MV_CONTEXT *mvc)
+{
+ int i = 0;
+
+ do
+ {
+ const vp8_prob *up = vp8_mv_update_probs[i].prob;
+ vp8_prob *p = (vp8_prob *)(mvc + i);
+ vp8_prob *const pstop = p + MVPcount;
+
+ do
+ {
+ if (vp8_read(bc, *up++))
+ {
+ const vp8_prob x = (vp8_prob)vp8_read_literal(bc, 7);
+
+ *p = x ? x << 1 : 1;
+ }
+ }
+ while (++p < pstop);
+ }
+ while (++i < 2);
+}
+
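+/* For each split configuration (16x8, 8x16, 8x8, 4x4): the number of 4x4
+ blocks covered by one partition, and the order in which they are
+ filled. */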
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15},
+ { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15},
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
+};
+
+
+static void mb_mode_mv_init(VP8D_COMP *pbi)
+{
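+ /* mbc[8] is the bool decoder for the first partition (modes and MVs);
+ * mbc[0..7] hold the residual token partitions, see
+ * setup_token_decoder(). */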
+ vp8_reader *const bc = & pbi->mbc[8];
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Default is that no macroblock is corrupt, therefore we initialize
+ * mvs_corrupt_from_mb to something very big, which we can be sure is
+ * outside the frame. */
+ pbi->mvs_corrupt_from_mb = UINT_MAX;
+#endif
+ /* Read the mb_no_coeff_skip flag */
+ pbi->common.mb_no_coeff_skip = (int)vp8_read_bit(bc);
+
+ pbi->prob_skip_false = 0;
+ if (pbi->common.mb_no_coeff_skip)
+ pbi->prob_skip_false = (vp8_prob)vp8_read_literal(bc, 8);
+
+ if(pbi->common.frame_type != KEY_FRAME)
+ {
+ pbi->prob_intra = (vp8_prob)vp8_read_literal(bc, 8);
+ pbi->prob_last = (vp8_prob)vp8_read_literal(bc, 8);
+ pbi->prob_gf = (vp8_prob)vp8_read_literal(bc, 8);
+
+ if (vp8_read_bit(bc))
+ {
+ int i = 0;
+
+ do
+ {
+ pbi->common.fc.ymode_prob[i] =
+ (vp8_prob) vp8_read_literal(bc, 8);
+ }
+ while (++i < 4);
+ }
+
+ if (vp8_read_bit(bc))
+ {
+ int i = 0;
+
+ do
+ {
+ pbi->common.fc.uv_mode_prob[i] =
+ (vp8_prob) vp8_read_literal(bc, 8);
+ }
+ while (++i < 3);
+ }
+
+ read_mvcontexts(bc, mvc);
+ }
+}
+
+const vp8_prob vp8_sub_mv_ref_prob3 [8][VP8_SUBMVREFS-1] =
+{
+ { 147, 136, 18 }, /* SUBMVREF_NORMAL */
+ { 223, 1, 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */
+ { 106, 145, 1 }, /* SUBMVREF_LEFT_ZED */
+ { 208, 1, 1 }, /* SUBMVREF_LEFT_ABOVE_ZED */
+ { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */
+ { 223, 1, 34 }, /* SUBMVREF_LEFT_ABOVE_SAME */
+ { 179, 121, 1 }, /* SUBMVREF_ABOVE_ZED */
+ { 208, 1, 1 } /* SUBMVREF_LEFT_ABOVE_ZED */
+};
+
+static
+const vp8_prob * get_sub_mv_ref_prob(const int left, const int above)
+{
+ int lez = (left == 0);
+ int aez = (above == 0);
+ int lea = (left == above);
+ const vp8_prob * prob;
+
+ prob = vp8_sub_mv_ref_prob3[(aez << 2) |
+ (lez << 1) |
+ (lea)];
+
+ return prob;
+}
+
+static void decode_split_mv(vp8_reader *const bc, MODE_INFO *mi,
+ const MODE_INFO *left_mb, const MODE_INFO *above_mb,
+ MB_MODE_INFO *mbmi, int_mv best_mv,
+ MV_CONTEXT *const mvc, int mb_to_left_edge,
+ int mb_to_right_edge, int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ int s; /* split configuration (16x8, 8x16, 8x8, 4x4) */
+ int num_p; /* number of partitions in the split configuration
+ (see vp8_mbsplit_count) */
+ int j = 0;
+
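+ /* The split type is coded with a small hardwired tree: 4x4 (16
+ * partitions) is the default; otherwise 8x8 (4 partitions); otherwise
+ * one final bit picks between 16x8 and 8x16 (2 partitions). */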
+ s = 3;
+ num_p = 16;
+ if( vp8_read(bc, 110) )
+ {
+ s = 2;
+ num_p = 4;
+ if( vp8_read(bc, 111) )
+ {
+ s = vp8_read(bc, 150);
+ num_p = 2;
+ }
+ }
+
+ do /* for each subset j */
+ {
+ int_mv leftmv, abovemv;
+ int_mv blockmv;
+ int k; /* first block in subset j */
+
+ const vp8_prob *prob;
+ k = vp8_mbsplit_offset[s][j];
+
+ if (!(k & 3))
+ {
+ /* On L edge, get from MB to left of us */
+ if(left_mb->mbmi.mode != SPLITMV)
+ leftmv.as_int = left_mb->mbmi.mv.as_int;
+ else
+ leftmv.as_int = (left_mb->bmi + k + 4 - 1)->mv.as_int;
+ }
+ else
+ leftmv.as_int = (mi->bmi + k - 1)->mv.as_int;
+
+ if (!(k >> 2))
+ {
+ /* On top edge, get from MB above us */
+ if(above_mb->mbmi.mode != SPLITMV)
+ abovemv.as_int = above_mb->mbmi.mv.as_int;
+ else
+ abovemv.as_int = (above_mb->bmi + k + 16 - 4)->mv.as_int;
+ }
+ else
+ abovemv.as_int = (mi->bmi + k - 4)->mv.as_int;
+
+ prob = get_sub_mv_ref_prob(leftmv.as_int, abovemv.as_int);
+
+ if( vp8_read(bc, prob[0]) )
+ {
+ if( vp8_read(bc, prob[1]) )
+ {
+ blockmv.as_int = 0;
+ if( vp8_read(bc, prob[2]) )
+ {
+ blockmv.as_mv.row = read_mvcomponent(bc, &mvc[0]) << 1;
+ blockmv.as_mv.row += best_mv.as_mv.row;
+ blockmv.as_mv.col = read_mvcomponent(bc, &mvc[1]) << 1;
+ blockmv.as_mv.col += best_mv.as_mv.col;
+ }
+ }
+ else
+ {
+ blockmv.as_int = abovemv.as_int;
+ }
+ }
+ else
+ {
+ blockmv.as_int = leftmv.as_int;
+ }
+
+ mbmi->need_to_clamp_mvs |= vp8_check_mv_bounds(&blockmv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+
+ {
+ /* Fill (uniform) modes, mvs of jth subset.
+ Must do it here because ensuing subsets can
+ refer back to us via "left" or "above". */
+ const unsigned char *fill_offset;
+ unsigned int fill_count = mbsplit_fill_count[s];
+
+ fill_offset = &mbsplit_fill_offset[s]
+ [(unsigned char)j * mbsplit_fill_count[s]];
+
+ do {
+ mi->bmi[ *fill_offset].mv.as_int = blockmv.as_int;
+ fill_offset++;
+ } while (--fill_count);
+ }
+
+ }
+ while (++j < num_p);
+
+ mbmi->partitioning = s;
+}
+
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ mbmi->ref_frame = (MV_REFERENCE_FRAME) vp8_read(bc, pbi->prob_intra);
+ if (mbmi->ref_frame) /* inter MB */
+ {
+ enum {CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV};
+ int cnt[4];
+ int *cntx = cnt;
+ int_mv near_mvs[4];
+ int_mv *nmv = near_mvs;
+ const int mis = pbi->mb.mode_info_stride;
+ const MODE_INFO *above = mi - mis;
+ const MODE_INFO *left = mi - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int *ref_frame_sign_bias = pbi->common.ref_frame_sign_bias;
+
+ mbmi->need_to_clamp_mvs = 0;
+
+ if (vp8_read(bc, pbi->prob_last))
+ {
+ mbmi->ref_frame =
+ (MV_REFERENCE_FRAME)((int)(2 + vp8_read(bc, pbi->prob_gf)));
+ }
+
+ /* Zero accumulators */
+ nmv[0].as_int = nmv[1].as_int = nmv[2].as_int = 0;
+ cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0;
+
+ /* Process above */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (above->mbmi.mv.as_int)
+ {
+ (++nmv)->as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame],
+ mbmi->ref_frame, nmv, ref_frame_sign_bias);
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+
+ /* Process left */
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (left->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame],
+ mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != nmv->as_int)
+ {
+ (++nmv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 2;
+ }
+ else
+ cnt[CNT_INTRA] += 2;
+ }
+
+ /* Process above left */
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ if (aboveleft->mbmi.mv.as_int)
+ {
+ int_mv this_mv;
+
+ this_mv.as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame],
+ mbmi->ref_frame, &this_mv, ref_frame_sign_bias);
+
+ if (this_mv.as_int != nmv->as_int)
+ {
+ (++nmv)->as_int = this_mv.as_int;
+ ++cntx;
+ }
+
+ *cntx += 1;
+ }
+ else
+ cnt[CNT_INTRA] += 1;
+ }
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_INTRA]] [0]) )
+ {
+
+ /* If we have three distinct MV's ... */
+ /* See if above-left MV can be merged with NEAREST */
+ cnt[CNT_NEAREST] += ( (cnt[CNT_SPLITMV] > 0) &
+ (nmv->as_int == near_mvs[CNT_NEAREST].as_int));
+
+ /* Swap near and nearest if necessary */
+ if (cnt[CNT_NEAR] > cnt[CNT_NEAREST])
+ {
+ int tmp;
+ tmp = cnt[CNT_NEAREST];
+ cnt[CNT_NEAREST] = cnt[CNT_NEAR];
+ cnt[CNT_NEAR] = tmp;
+ tmp = near_mvs[CNT_NEAREST].as_int;
+ near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
+ near_mvs[CNT_NEAR].as_int = tmp;
+ }
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAREST]] [1]) )
+ {
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_NEAR]] [2]) )
+ {
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ MV_CONTEXT *const mvc = pbi->common.fc.mvc;
+ int near_index;
+
+ mb_to_top_edge = pbi->mb.mb_to_top_edge;
+ mb_to_bottom_edge = pbi->mb.mb_to_bottom_edge;
+ mb_to_top_edge -= LEFT_TOP_MARGIN;
+ mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_right_edge = pbi->mb.mb_to_right_edge;
+ mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
+ mb_to_left_edge = pbi->mb.mb_to_left_edge;
+ mb_to_left_edge -= LEFT_TOP_MARGIN;
+
+ /* Use near_mvs[0] to store the "best" MV */
+ near_index = CNT_INTRA +
+ (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]);
+
+ vp8_clamp_mv2(&near_mvs[near_index], &pbi->mb);
+
+ cnt[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV)
+ + (left->mbmi.mode == SPLITMV)) * 2
+ + (aboveleft->mbmi.mode == SPLITMV);
+
+ if( vp8_read(bc, vp8_mode_contexts [cnt[CNT_SPLITMV]] [3]) )
+ {
+ decode_split_mv(bc, mi, left, above,
+ mbmi,
+ near_mvs[near_index],
+ mvc, mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ mbmi->mv.as_int = mi->bmi[15].mv.as_int;
+ mbmi->mode = SPLITMV;
+ mbmi->is_4x4 = 1;
+ }
+ else
+ {
+ int_mv *const mbmi_mv = & mbmi->mv;
+ read_mv(bc, &mbmi_mv->as_mv, (const MV_CONTEXT *) mvc);
+ mbmi_mv->as_mv.row += near_mvs[near_index].as_mv.row;
+ mbmi_mv->as_mv.col += near_mvs[near_index].as_mv.col;
+
+ /* Don't need to check this on NEARMV and NEARESTMV
+ * modes since those modes clamp the MV. The NEWMV mode
+ * does not, so signal to the prediction stage whether
+ * special handling may be required.
+ */
+ mbmi->need_to_clamp_mvs =
+ vp8_check_mv_bounds(mbmi_mv, mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ mbmi->mode = NEWMV;
+ }
+ }
+ else
+ {
+ mbmi->mode = NEARMV;
+ vp8_clamp_mv2(&near_mvs[CNT_NEAR], &pbi->mb);
+ mbmi->mv.as_int = near_mvs[CNT_NEAR].as_int;
+ }
+ }
+ else
+ {
+ mbmi->mode = NEARESTMV;
+ vp8_clamp_mv2(&near_mvs[CNT_NEAREST], &pbi->mb);
+ mbmi->mv.as_int = near_mvs[CNT_NEAREST].as_int;
+ }
+ }
+ else
+ {
+ mbmi->mode = ZEROMV;
+ mbmi->mv.as_int = 0;
+ }
+
+#if CONFIG_ERROR_CONCEALMENT
+ if(pbi->ec_enabled && (mbmi->mode != SPLITMV))
+ {
+ mi->bmi[ 0].mv.as_int =
+ mi->bmi[ 1].mv.as_int =
+ mi->bmi[ 2].mv.as_int =
+ mi->bmi[ 3].mv.as_int =
+ mi->bmi[ 4].mv.as_int =
+ mi->bmi[ 5].mv.as_int =
+ mi->bmi[ 6].mv.as_int =
+ mi->bmi[ 7].mv.as_int =
+ mi->bmi[ 8].mv.as_int =
+ mi->bmi[ 9].mv.as_int =
+ mi->bmi[10].mv.as_int =
+ mi->bmi[11].mv.as_int =
+ mi->bmi[12].mv.as_int =
+ mi->bmi[13].mv.as_int =
+ mi->bmi[14].mv.as_int =
+ mi->bmi[15].mv.as_int = mbmi->mv.as_int;
+ }
+#endif
+ }
+ else
+ {
+ /* required for left and above block mv */
+ mbmi->mv.as_int = 0;
+
+ /* MB is intra coded */
+ if ((mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob)) == B_PRED)
+ {
+ int j = 0;
+ mbmi->is_4x4 = 1;
+ do
+ {
+ mi->bmi[j].as_mode = read_bmode(bc, pbi->common.fc.bmode_prob);
+ }
+ while (++j < 16);
+ }
+
+ mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob);
+ }
+
+}
+
+static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
+{
+ /* Is segmentation enabled */
+ if (x->segmentation_enabled && x->update_mb_segmentation_map)
+ {
+ /* If so then read the segment id. */
+ if (vp8_read(r, x->mb_segment_tree_probs[0]))
+ mi->segment_id =
+ (unsigned char)(2 + vp8_read(r, x->mb_segment_tree_probs[2]));
+ else
+ mi->segment_id =
+ (unsigned char)(vp8_read(r, x->mb_segment_tree_probs[1]));
+ }
+}
+
+static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
+ MB_MODE_INFO *mbmi)
+{
+ /* Read the Macroblock segmentation map if it is being updated explicitly
+ * this frame (reset to 0 above by default)
+ * By default on a key frame reset all MBs to segment 0
+ */
+ if (pbi->mb.update_mb_segmentation_map)
+ read_mb_features(&pbi->mbc[8], &mi->mbmi, &pbi->mb);
+ else if(pbi->common.frame_type == KEY_FRAME)
+ mi->mbmi.segment_id = 0;
+
+ /* Read the macroblock coeff skip flag if this feature is in use,
+ * else default to 0 */
+ if (pbi->common.mb_no_coeff_skip)
+ mi->mbmi.mb_skip_coeff = vp8_read(&pbi->mbc[8], pbi->prob_skip_false);
+ else
+ mi->mbmi.mb_skip_coeff = 0;
+
+ mi->mbmi.is_4x4 = 0;
+ if(pbi->common.frame_type == KEY_FRAME)
+ read_kf_modes(pbi, mi);
+ else
+ read_mb_modes_mv(pbi, mi, &mi->mbmi);
+
+}
+
+void vp8_decode_mode_mvs(VP8D_COMP *pbi)
+{
+ MODE_INFO *mi = pbi->common.mi;
+ int mb_row = -1;
+ int mb_to_right_edge_start;
+
+ mb_mode_mv_init(pbi);
+
+ pbi->mb.mb_to_top_edge = 0;
+ pbi->mb.mb_to_bottom_edge = ((pbi->common.mb_rows - 1) * 16) << 3;
+ mb_to_right_edge_start = ((pbi->common.mb_cols - 1) * 16) << 3;
+
+ while (++mb_row < pbi->common.mb_rows)
+ {
+ int mb_col = -1;
+
+ pbi->mb.mb_to_left_edge = 0;
+ pbi->mb.mb_to_right_edge = mb_to_right_edge_start;
+
+ while (++mb_col < pbi->common.mb_cols)
+ {
+#if CONFIG_ERROR_CONCEALMENT
+ int mb_num = mb_row * pbi->common.mb_cols + mb_col;
+#endif
+
+ decode_mb_mode_mvs(pbi, mi, &mi->mbmi);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* look for corruption. set mvs_corrupt_from_mb to the current
+ * mb_num if the frame is corrupt from this macroblock. */
+ if (vp8dx_bool_error(&pbi->mbc[8]) && mb_num <
+ (int)pbi->mvs_corrupt_from_mb)
+ {
+ pbi->mvs_corrupt_from_mb = mb_num;
+ /* no need to continue since the partition is corrupt from
+ * here on.
+ */
+ return;
+ }
+#endif
+
+ pbi->mb.mb_to_left_edge -= (16 << 3);
+ pbi->mb.mb_to_right_edge -= (16 << 3);
+ mi++; /* next macroblock */
+ }
+ pbi->mb.mb_to_top_edge -= (16 << 3);
+ pbi->mb.mb_to_bottom_edge -= (16 << 3);
+
+ mi++; /* skip left predictor each row */
+ }
+}
diff --git a/libvpx/vp8/decoder/decodemv.h b/libvpx/vp8/decoder/decodemv.h
new file mode 100644
index 0000000..9403424
--- /dev/null
+++ b/libvpx/vp8/decoder/decodemv.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyxd_int.h"
+
+void vp8_decode_mode_mvs(VP8D_COMP *);
diff --git a/libvpx/vp8/decoder/decoderthreading.h b/libvpx/vp8/decoder/decoderthreading.h
new file mode 100644
index 0000000..60c39d1
--- /dev/null
+++ b/libvpx/vp8/decoder/decoderthreading.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+
+
+
+#ifndef _DECODER_THREADING_H
+#define _DECODER_THREADING_H
+
+#if CONFIG_MULTITHREAD
+extern void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd);
+extern void vp8_decoder_remove_threads(VP8D_COMP *pbi);
+extern void vp8_decoder_create_threads(VP8D_COMP *pbi);
+extern void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows);
+extern void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows);
+#endif
+
+#endif
diff --git a/libvpx/vp8/decoder/decodframe.c b/libvpx/vp8/decoder/decodframe.c
new file mode 100644
index 0000000..a4a00f6
--- /dev/null
+++ b/libvpx/vp8/decoder/decodframe.c
@@ -0,0 +1,1385 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "detokenize.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/setupintrarecon.h"
+
+#include "decodemv.h"
+#include "vp8/common/extend.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include "dboolhuff.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
+{
+ int Q;
+ VP8_COMMON *const pc = & pbi->common;
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ pc->Y1dequant[Q][0] = (short)vp8_dc_quant(Q, pc->y1dc_delta_q);
+ pc->Y2dequant[Q][0] = (short)vp8_dc2quant(Q, pc->y2dc_delta_q);
+ pc->UVdequant[Q][0] = (short)vp8_dc_uv_quant(Q, pc->uvdc_delta_q);
+
+ pc->Y1dequant[Q][1] = (short)vp8_ac_yquant(Q);
+ pc->Y2dequant[Q][1] = (short)vp8_ac2quant(Q, pc->y2ac_delta_q);
+ pc->UVdequant[Q][1] = (short)vp8_ac_uv_quant(Q, pc->uvac_delta_q);
+ }
+}
+
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ int i;
+ int QIndex;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+ VP8_COMMON *const pc = & pbi->common;
+
+ /* Decide whether to use the default or alternate baseline Q value. */
+ if (xd->segmentation_enabled)
+ {
+ /* Abs Value */
+ if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+ /* Delta Value */
+ else
+ {
+ QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
+ }
+ }
+ else
+ QIndex = pc->base_qindex;
+
+ /* Set up the macroblock dequant constants */
+ xd->dequant_y1_dc[0] = 1;
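+ /* The luma DC factor is 1 here because, when the second-order (Y2)
+ * transform is present, the DC coefficient arrives already dequantized
+ * from the Y2 block (see the DQC override in decode_macroblock). */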
+ xd->dequant_y1[0] = pc->Y1dequant[QIndex][0];
+ xd->dequant_y2[0] = pc->Y2dequant[QIndex][0];
+ xd->dequant_uv[0] = pc->UVdequant[QIndex][0];
+
+ for (i = 1; i < 16; i++)
+ {
+ xd->dequant_y1_dc[i] =
+ xd->dequant_y1[i] = pc->Y1dequant[QIndex][1];
+ xd->dequant_y2[i] = pc->Y2dequant[QIndex][1];
+ xd->dequant_uv[i] = pc->UVdequant[QIndex][1];
+ }
+}
+
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ unsigned int mb_idx)
+{
+ MB_PREDICTION_MODE mode;
+ int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#endif
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else if (!vp8dx_bool_error(xd->current_bc))
+ {
+ int eobtotal;
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+ }
+
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
+ {
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ vp8_conceal_corrupt_mb(xd);
+
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+ * */
+ vpx_memset(xd->eobs, 0, 25);
+ }
+ }
+#endif
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ if (mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ vpx_memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ unsigned char *dst = xd->dst.y_buffer + b->offset;
+ B_PREDICTION_MODE b_mode =
+ xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ int left_stride = dst_stride;
+ unsigned char top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, left_stride, b_mode,
+ dst, dst_stride, top_left);
+
+ if (xd->eobs[i])
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add
+ (b->qcoeff[0] * DQC[0],
+ dst, dst_stride,
+ dst, dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (corruption_detected)
+ {
+ return;
+ }
+#endif
+
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ /* dequantization and idct */
+ if (mode != B_PRED)
+ {
+ short *DQC = xd->dequant_y1;
+
+ if (mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
+ }
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+static int get_delta_q(vp8_reader *bc, int prev, int *q_update)
+{
+ int ret_val = 0;
+
+ if (vp8_read_bit(bc))
+ {
+ ret_val = vp8_read_literal(bc, 4);
+
+ if (vp8_read_bit(bc))
+ ret_val = -ret_val;
+ }
+
+ /* Trigger a quantizer update if the delta-q value has changed */
+ if (ret_val != prev)
+ *q_update = 1;
+
+ return ret_val;
+}
+
+#ifdef PACKET_TESTING
+#include <stdio.h>
+FILE *vpxlog = 0;
+#endif
+
+static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+ unsigned char *src_ptr1;
+ unsigned char *dest_ptr1;
+
+ unsigned int Border;
+ int plane_stride;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ src_ptr1 = ybf->y_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)Border; i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ Border /= 2;
+ src_ptr1 = ybf->u_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ src_ptr1 = ybf->v_buffer - Border;
+ dest_ptr1 = src_ptr1 - (Border * plane_stride);
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ dest_ptr1 += plane_stride;
+ }
+}
+
+static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr2;
+
+ unsigned int Border;
+ int plane_stride;
+ int plane_height;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ plane_height = ybf->y_height;
+
+ src_ptr1 = ybf->y_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)Border; i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ plane_height = ybf->uv_height;
+ Border /= 2;
+
+ src_ptr1 = ybf->u_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ src_ptr1 = ybf->v_buffer - Border;
+ src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
+ dest_ptr2 = src_ptr2 + plane_stride;
+
+ for (i = 0; i < (int)(Border); i++)
+ {
+ vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ dest_ptr2 += plane_stride;
+ }
+}
+
+static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
+ unsigned char *y_src,
+ unsigned char *u_src,
+ unsigned char *v_src)
+{
+ int i;
+ unsigned char *src_ptr1, *src_ptr2;
+ unsigned char *dest_ptr1, *dest_ptr2;
+
+ unsigned int Border;
+ int plane_stride;
+ int plane_height;
+ int plane_width;
+
+ /***********/
+ /* Y Plane */
+ /***********/
+ Border = ybf->border;
+ plane_stride = ybf->y_stride;
+ plane_height = 16;
+ plane_width = ybf->y_width;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = y_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], Border);
+ vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* U Plane */
+ /***********/
+ plane_stride = ybf->uv_stride;
+ plane_height = 8;
+ plane_width = ybf->uv_width;
+ Border /= 2;
+
+ /* copy the left and right most columns out */
+ src_ptr1 = u_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], Border);
+ vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+
+ /***********/
+ /* V Plane */
+ /***********/
+
+ /* copy the left and right most columns out */
+ src_ptr1 = v_src;
+ src_ptr2 = src_ptr1 + plane_width - 1;
+ dest_ptr1 = src_ptr1 - Border;
+ dest_ptr2 = src_ptr2 + 1;
+
+ for (i = 0; i < plane_height; i++)
+ {
+ vpx_memset(dest_ptr1, src_ptr1[0], Border);
+ vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ src_ptr1 += plane_stride;
+ src_ptr2 += plane_stride;
+ dest_ptr1 += plane_stride;
+ dest_ptr2 += plane_stride;
+ }
+}
+
+static void decode_mb_rows(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+
+ MODE_INFO *lf_mic = xd->mode_info_context;
+
+ int ibc = 0;
+ int num_part = 1 << pc->multi_token_partition;
+
+ int recon_yoffset, recon_uvoffset;
+ int mb_row, mb_col;
+ int mb_idx = 0;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ int recon_y_stride = yv12_fb_new->y_stride;
+ int recon_uv_stride = yv12_fb_new->uv_stride;
+
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ unsigned char *lf_dst[3];
+ unsigned char *eb_dst[3];
+ int i;
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+ ref_buffer[i][0] = this_fb->y_buffer;
+ ref_buffer[i][1] = this_fb->u_buffer;
+ ref_buffer[i][2] = this_fb->v_buffer;
+
+ ref_fb_corrupted[i] = this_fb->corrupted;
+ }
+
+ /* Set up the buffer pointers */
+ eb_dst[0] = lf_dst[0] = dst_buffer[0] = yv12_fb_new->y_buffer;
+ eb_dst[1] = lf_dst[1] = dst_buffer[1] = yv12_fb_new->u_buffer;
+ eb_dst[2] = lf_dst[2] = dst_buffer[2] = yv12_fb_new->v_buffer;
+
+ xd->up_available = 0;
+
+ /* Initialize the loop filter for this frame. */
+ if(pc->filter_level)
+ vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
+ vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+ /* Decode the individual macro block */
+ for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
+ {
+ if (num_part > 1)
+ {
+ xd->current_bc = & pbi->mbc[ibc];
+ ibc++;
+
+ if (ibc == num_part)
+ ibc = 0;
+ }
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+ xd->recon_left[2], xd->dst.y_stride,
+ xd->dst.uv_stride);
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ {
+ /* Distance of Mb to the various image edges.
+ * These are specified to 8th pel as they are always compared to values
+ * that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+#if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt coefficients, better to
+ * conceal with an inter block. Interpolate MVs from neighboring
+ * MBs.
+ *
+ * Note that for the first mb with corrupt residual in a frame,
+ * we might not discover that before decoding the residual. That
+ * happens after this check, and therefore no inter concealment
+ * will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols,
+ pc->mode_info_stride);
+ }
+ }
+#endif
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ decode_macroblock(pbi, xd, mb_idx);
+
+ mb_idx++;
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+ }
+
+ /* adjust to the next row of mbs */
+ vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
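+ /* The loop filter runs one MB row behind decoding, and border
+ * extension runs one further row behind the filter, so neither stage
+ * touches pixels that are still subject to change. */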
+ if(pc->filter_level)
+ {
+ if(mb_row > 0)
+ {
+ if (pc->filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+ recon_y_stride, recon_uv_stride,
+ lf_dst[0], lf_dst[1], lf_dst[2]);
+ else
+ vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+ recon_y_stride, recon_uv_stride,
+ lf_dst[0], lf_dst[1], lf_dst[2]);
+
+ if(mb_row > 1)
+ {
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+
+ if(mb_row == 2)
+ yv12_extend_frame_top_c(yv12_fb_new);
+
+ }
+
+ lf_dst[0] += recon_y_stride * 16;
+ lf_dst[1] += recon_uv_stride * 8;
+ lf_dst[2] += recon_uv_stride * 8;
+ lf_mic += pc->mb_cols;
+ lf_mic++; /* Skip border mb */
+ }
+ }
+ else
+ {
+ if(mb_row > 0)
+ {
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+
+ if(mb_row == 1)
+ yv12_extend_frame_top_c(yv12_fb_new);
+ }
+ }
+ }
+
+ if(pc->filter_level)
+ {
+ if (pc->filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+ recon_uv_stride, lf_dst[0], lf_dst[1],
+ lf_dst[2]);
+ else
+ vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+ recon_uv_stride, lf_dst[0], lf_dst[1],
+ lf_dst[2]);
+
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+ eb_dst[0] += recon_y_stride * 16;
+ eb_dst[1] += recon_uv_stride * 8;
+ eb_dst[2] += recon_uv_stride * 8;
+ }
+ yv12_extend_frame_left_right_c(yv12_fb_new,
+ eb_dst[0],
+ eb_dst[1],
+ eb_dst[2]);
+
+ yv12_extend_frame_bottom_c(yv12_fb_new);
+
+}
+
+static unsigned int read_partition_size(const unsigned char *cx_size)
+{
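+ /* Partition sizes are stored as 24-bit little-endian values. */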
+ const unsigned int size =
+ cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);
+ return size;
+}
+
+static int read_is_valid(const unsigned char *start,
+ size_t len,
+ const unsigned char *end)
+{
+ return (start + len > start && start + len <= end);
+}
+
+static unsigned int read_available_partition_size(
+ VP8D_COMP *pbi,
+ const unsigned char *token_part_sizes,
+ const unsigned char *fragment_start,
+ const unsigned char *first_fragment_end,
+ const unsigned char *fragment_end,
+ int i,
+ int num_part)
+{
+ VP8_COMMON* pc = &pbi->common;
+ const unsigned char *partition_size_ptr = token_part_sizes + i * 3;
+ unsigned int partition_size = 0;
+ ptrdiff_t bytes_left = fragment_end - fragment_start;
+ /* Calculate the length of this partition. The last partition
+ * size is implicit. If the partition size can't be read, then
+ * either use the remaining data in the buffer (for EC mode)
+ * or throw an error.
+ */
+ if (i < num_part - 1)
+ {
+ if (read_is_valid(partition_size_ptr, 3, first_fragment_end))
+ partition_size = read_partition_size(partition_size_ptr);
+ else if (pbi->ec_active)
+ partition_size = (unsigned int)bytes_left;
+ else
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated partition size data");
+ }
+ else
+ partition_size = (unsigned int)bytes_left;
+
+ /* Validate the calculated partition length. If the buffer
+ * described by the partition can't be fully read, then restrict
+ * it to the portion that can be (for EC mode) or throw an error.
+ */
+ if (!read_is_valid(fragment_start, partition_size, fragment_end))
+ {
+ if (pbi->ec_active)
+ partition_size = (unsigned int)bytes_left;
+ else
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition "
+ "%d length", i + 1);
+ }
+ return partition_size;
+}
+
+
+static void setup_token_decoder(VP8D_COMP *pbi,
+ const unsigned char* token_part_sizes)
+{
+ vp8_reader *bool_decoder = &pbi->mbc[0];
+ unsigned int partition_idx;
+ unsigned int fragment_idx;
+ unsigned int num_token_partitions;
+ const unsigned char *first_fragment_end = pbi->fragments[0] +
+ pbi->fragment_sizes[0];
+
+ TOKEN_PARTITION multi_token_partition =
+ (TOKEN_PARTITION)vp8_read_literal(&pbi->mbc[8], 2);
+ if (!vp8dx_bool_error(&pbi->mbc[8]))
+ pbi->common.multi_token_partition = multi_token_partition;
+ num_token_partitions = 1 << pbi->common.multi_token_partition;
+
+ /* Check for partitions within the fragments and unpack the fragments
+ * so that each fragment pointer points to its corresponding partition. */
+ for (fragment_idx = 0; fragment_idx < pbi->num_fragments; ++fragment_idx)
+ {
+ unsigned int fragment_size = pbi->fragment_sizes[fragment_idx];
+ const unsigned char *fragment_end = pbi->fragments[fragment_idx] +
+ fragment_size;
+ /* Special case for handling the first partition since we have already
+ * read its size. */
+ if (fragment_idx == 0)
+ {
+ /* Size of first partition + token partition sizes element */
+ ptrdiff_t ext_first_part_size = token_part_sizes -
+ pbi->fragments[0] + 3 * (num_token_partitions - 1);
+ fragment_size -= (unsigned int)ext_first_part_size;
+ if (fragment_size > 0)
+ {
+ pbi->fragment_sizes[0] = (unsigned int)ext_first_part_size;
+ /* The fragment contains an additional partition. Move to
+ * next. */
+ fragment_idx++;
+ pbi->fragments[fragment_idx] = pbi->fragments[0] +
+ pbi->fragment_sizes[0];
+ }
+ }
+ /* Split the chunk into partitions read from the bitstream */
+ while (fragment_size > 0)
+ {
+ ptrdiff_t partition_size = read_available_partition_size(
+ pbi,
+ token_part_sizes,
+ pbi->fragments[fragment_idx],
+ first_fragment_end,
+ fragment_end,
+ fragment_idx - 1,
+ num_token_partitions);
+ pbi->fragment_sizes[fragment_idx] = (unsigned int)partition_size;
+ fragment_size -= (unsigned int)partition_size;
+ assert(fragment_idx <= num_token_partitions);
+ if (fragment_size > 0)
+ {
+ /* The fragment contains an additional partition.
+ * Move to next. */
+ fragment_idx++;
+ pbi->fragments[fragment_idx] =
+ pbi->fragments[fragment_idx - 1] + partition_size;
+ }
+ }
+ }
+
+ pbi->num_fragments = num_token_partitions + 1;
+
+ for (partition_idx = 1; partition_idx < pbi->num_fragments; ++partition_idx)
+ {
+ if (vp8dx_start_decode(bool_decoder,
+ pbi->fragments[partition_idx],
+ pbi->fragment_sizes[partition_idx]))
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder %d",
+ partition_idx);
+
+ bool_decoder++;
+ }
+
+#if CONFIG_MULTITHREAD
+ /* Clamp number of decoder threads */
+ if (pbi->decoding_thread_count > num_token_partitions - 1)
+ pbi->decoding_thread_count = num_token_partitions - 1;
+#endif
+}
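+
+/* The number of token partitions is signalled as a 2-bit literal, so
+ * num_token_partitions is always 1, 2, 4 or 8 (literal values 0-3).
+ * A stand-alone sketch of the mapping, for illustration only:
+ *
+ *     unsigned int token_partition_count(int two_bit_literal)
+ *     {
+ *         return 1u << two_bit_literal;
+ *     }
+ */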
+
+
+static void init_frame(VP8D_COMP *pbi)
+{
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ /* Various keyframe initializations */
+ vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+
+ vp8_init_mbmode_probs(pc);
+
+ vp8_default_coef_probs(pc);
+
+        /* Reset the segment feature data to 0, with delta coding as the default state. */
+ vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
+
+        /* reset the mode and ref deltas for the loop filter */
+ vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+
+ /* All buffers are implicitly updated on key frames. */
+ pc->refresh_golden_frame = 1;
+ pc->refresh_alt_ref_frame = 1;
+ pc->copy_buffer_to_gf = 0;
+ pc->copy_buffer_to_arf = 0;
+
+ /* Note that Golden and Altref modes cannot be used on a key frame so
+ * ref_frame_sign_bias[] is undefined and meaningless
+ */
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+ }
+ else
+ {
+        /* Choose between the six-tap and bilinear interpolation filters */
+ if (!pc->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ if (pbi->decoded_key_frame && pbi->ec_enabled && !pbi->ec_active)
+ pbi->ec_active = 1;
+ }
+
+ xd->left_context = &pc->left_context;
+ xd->mode_info_context = pc->mi;
+ xd->frame_type = pc->frame_type;
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_stride = pc->mode_info_stride;
+ xd->corrupted = 0; /* init without corruption */
+
+ xd->fullpixel_mask = 0xffffffff;
+ if(pc->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+
+}
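+
+/* With pc->full_pixel set, motion vectors (stored in Q3, eighth-pel
+ * units) are truncated to whole pixels by masking off the three
+ * fractional bits. A minimal sketch of the effect, with hypothetical
+ * values:
+ *
+ *     int mv_col = 13;           (13/8 pel)
+ *     mv_col &= 0xfffffff8;      (now 8, i.e. exactly 1 pel)
+ */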
+
+int vp8_decode_frame(VP8D_COMP *pbi)
+{
+ vp8_reader *const bc = & pbi->mbc[8];
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+ const unsigned char *data = pbi->fragments[0];
+ const unsigned char *data_end = data + pbi->fragment_sizes[0];
+ ptrdiff_t first_partition_length_in_bytes;
+
+ int i, j, k, l;
+ const int *const mb_feature_data_bits = vp8_mb_feature_data_bits;
+ int corrupt_tokens = 0;
+ int prev_independent_partitions = pbi->independent_partitions;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ /* start with no corruption of current frame */
+ xd->corrupted = 0;
+ yv12_fb_new->corrupted = 0;
+
+ if (data_end - data < 3)
+ {
+ if (!pbi->ec_active)
+ {
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet");
+ }
+
+ /* Declare the missing frame as an inter frame since it will
+ be handled as an inter frame when we have estimated its
+ motion vectors. */
+ pc->frame_type = INTER_FRAME;
+ pc->version = 0;
+ pc->show_frame = 1;
+ first_partition_length_in_bytes = 0;
+ }
+ else
+ {
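+        /* The three-byte frame tag packs, starting at the LSB: a 1-bit
+         * frame type, a 3-bit version, a 1-bit show_frame flag and a
+         * 19-bit first-partition size. */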
+ pc->frame_type = (FRAME_TYPE)(data[0] & 1);
+ pc->version = (data[0] >> 1) & 7;
+ pc->show_frame = (data[0] >> 4) & 1;
+ first_partition_length_in_bytes =
+ (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
+
+ if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end
+ || data + first_partition_length_in_bytes < data))
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition 0 length");
+
+ data += 3;
+
+ vp8_setup_version(pc);
+
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ /* vet via sync code */
+ /* When error concealment is enabled we should only check the sync
+ * code if we have enough bits available
+ */
+ if (!pbi->ec_active || data + 3 < data_end)
+ {
+ if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)
+ vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,
+ "Invalid frame sync code");
+ }
+
+ /* If error concealment is enabled we should only parse the new size
+ * if we have enough data. Otherwise we will end up with the wrong
+ * size.
+ */
+ if (!pbi->ec_active || data + 6 < data_end)
+ {
+ pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;
+ pc->horiz_scale = data[4] >> 6;
+ pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;
+ pc->vert_scale = data[6] >> 6;
+ }
+ data += 7;
+
+ }
+ else
+ {
+ vpx_memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ vpx_memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ }
+ }
+    if (!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME)
+ {
+ return -1;
+ }
+
+ init_frame(pbi);
+
+ if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data)))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder 0");
+ if (pc->frame_type == KEY_FRAME) {
+ pc->clr_type = (YUV_TYPE)vp8_read_bit(bc);
+ pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc);
+ }
+
+ /* Is segmentation enabled */
+ xd->segmentation_enabled = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->segmentation_enabled)
+ {
+ /* Signal whether or not the segmentation map is being explicitly updated this frame. */
+ xd->update_mb_segmentation_map = (unsigned char)vp8_read_bit(bc);
+ xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->update_mb_segmentation_data)
+ {
+ xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
+
+ vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+
+ /* For each segmentation feature (Quant and loop filter level) */
+ for (i = 0; i < MB_LVL_MAX; i++)
+ {
+ for (j = 0; j < MAX_MB_SEGMENTS; j++)
+ {
+ /* Frame level data */
+ if (vp8_read_bit(bc))
+ {
+ xd->segment_feature_data[i][j] = (signed char)vp8_read_literal(bc, mb_feature_data_bits[i]);
+
+ if (vp8_read_bit(bc))
+ xd->segment_feature_data[i][j] = -xd->segment_feature_data[i][j];
+ }
+ else
+ xd->segment_feature_data[i][j] = 0;
+ }
+ }
+ }
+
+ if (xd->update_mb_segmentation_map)
+ {
+            /* Default all segment tree probabilities to 255 (i.e. not set) */
+ vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+
+ /* Read the probs used to decode the segment id for each macro block. */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+ {
+            /* If not explicitly set, the value keeps the 255 default from the memset above */
+ if (vp8_read_bit(bc))
+ xd->mb_segment_tree_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
+ }
+ }
+ }
+ else
+ {
+ /* No segmentation updates on this frame */
+ xd->update_mb_segmentation_map = 0;
+ xd->update_mb_segmentation_data = 0;
+ }
+
+ /* Read the loop filter level and type */
+ pc->filter_type = (LOOPFILTERTYPE) vp8_read_bit(bc);
+ pc->filter_level = vp8_read_literal(bc, 6);
+ pc->sharpness_level = vp8_read_literal(bc, 3);
+
+ /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */
+ xd->mode_ref_lf_delta_update = 0;
+ xd->mode_ref_lf_delta_enabled = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->mode_ref_lf_delta_enabled)
+ {
+ /* Do the deltas need to be updated */
+ xd->mode_ref_lf_delta_update = (unsigned char)vp8_read_bit(bc);
+
+ if (xd->mode_ref_lf_delta_update)
+ {
+ /* Send update */
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ {
+ if (vp8_read_bit(bc))
+ {
+ /*sign = vp8_read_bit( bc );*/
+ xd->ref_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+ if (vp8_read_bit(bc)) /* Apply sign */
+ xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;
+ }
+ }
+
+ /* Send update */
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ {
+ if (vp8_read_bit(bc))
+ {
+ /*sign = vp8_read_bit( bc );*/
+ xd->mode_lf_deltas[i] = (signed char)vp8_read_literal(bc, 6);
+
+ if (vp8_read_bit(bc)) /* Apply sign */
+ xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;
+ }
+ }
+ }
+ }
+
+ setup_token_decoder(pbi, data + first_partition_length_in_bytes);
+
+ xd->current_bc = &pbi->mbc[0];
+
+ /* Read the default quantizers. */
+ {
+ int Q, q_update;
+
+ Q = vp8_read_literal(bc, 7); /* AC 1st order Q = default */
+ pc->base_qindex = Q;
+ q_update = 0;
+ pc->y1dc_delta_q = get_delta_q(bc, pc->y1dc_delta_q, &q_update);
+ pc->y2dc_delta_q = get_delta_q(bc, pc->y2dc_delta_q, &q_update);
+ pc->y2ac_delta_q = get_delta_q(bc, pc->y2ac_delta_q, &q_update);
+ pc->uvdc_delta_q = get_delta_q(bc, pc->uvdc_delta_q, &q_update);
+ pc->uvac_delta_q = get_delta_q(bc, pc->uvac_delta_q, &q_update);
+
+ if (q_update)
+ vp8cx_init_de_quantizer(pbi);
+
+ /* MB level dequantizer setup */
+ vp8_mb_init_dequantizer(pbi, &pbi->mb);
+ }
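+
+    /* Each delta (y1dc, y2dc, y2ac, uvdc, uvac) is coded as an update
+     * flag followed, when set, by a 4-bit magnitude and a sign bit.
+     * A hedged sketch of the core read, omitting the q_update
+     * bookkeeping that get_delta_q also performs:
+     *
+     *     int read_delta_q(vp8_reader *bc)
+     *     {
+     *         int ret_val = 0;
+     *         if (vp8_read_bit(bc))
+     *         {
+     *             ret_val = vp8_read_literal(bc, 4);
+     *             if (vp8_read_bit(bc))
+     *                 ret_val = -ret_val;
+     *         }
+     *         return ret_val;
+     *     }
+     */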
+
+ /* Determine if the golden frame or ARF buffer should be updated and how.
+ * For all non key frames the GF and ARF refresh flags and sign bias
+ * flags must be set explicitly.
+ */
+ if (pc->frame_type != KEY_FRAME)
+ {
+ /* Should the GF or ARF be updated from the current frame */
+ pc->refresh_golden_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_golden_frame = 0;
+#endif
+
+ pc->refresh_alt_ref_frame = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh altref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_alt_ref_frame = 0;
+#endif
+
+ /* Buffer to buffer copy flags. */
+ pc->copy_buffer_to_gf = 0;
+
+ if (!pc->refresh_golden_frame)
+ pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the golden if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_gf = 0;
+#endif
+
+ pc->copy_buffer_to_arf = 0;
+
+ if (!pc->refresh_alt_ref_frame)
+ pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->copy_buffer_to_arf = 0;
+#endif
+
+
+ pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
+ pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
+ }
+
+ pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we shouldn't refresh the probabilities if the bit is
+ * missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_entropy_probs = 0;
+#endif
+ if (pc->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ }
+
+ pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* Assume we should refresh the last frame if the bit is missing */
+ xd->corrupted |= vp8dx_bool_error(bc);
+ if (pbi->ec_active && xd->corrupted)
+ pc->refresh_last_frame = 1;
+#endif
+
+ if (0)
+ {
+ FILE *z = fopen("decodestats.stt", "a");
+ fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+ pc->current_video_frame,
+ pc->frame_type,
+ pc->refresh_golden_frame,
+ pc->refresh_alt_ref_frame,
+ pc->refresh_last_frame,
+ pc->base_qindex);
+ fclose(z);
+ }
+
+ {
+ pbi->independent_partitions = 1;
+
+ /* read coef probability tree */
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+
+ vp8_prob *const p = pc->fc.coef_probs [i][j][k] + l;
+
+ if (vp8_read(bc, vp8_coef_update_probs [i][j][k][l]))
+ {
+ *p = (vp8_prob)vp8_read_literal(bc, 8);
+
+ }
+ if (k > 0 && *p != pc->fc.coef_probs[i][j][k-1][l])
+ pbi->independent_partitions = 0;
+
+ }
+ }
+
+ /* clear out the coeff buffer */
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
+ vp8_decode_mode_mvs(pbi);
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (pbi->ec_active &&
+ pbi->mvs_corrupt_from_mb < (unsigned int)pc->mb_cols * pc->mb_rows)
+ {
+ /* Motion vectors are missing in this frame. We will try to estimate
+ * them and then continue decoding the frame as usual */
+ vp8_estimate_missing_mvs(pbi);
+ }
+#endif
+
+ vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+ pbi->frame_corrupt_residual = 0;
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
+ {
+ unsigned int i;
+ vp8mt_decode_mb_rows(pbi, xd);
+ vp8_yv12_extend_frame_borders(yv12_fb_new);
+ for (i = 0; i < pbi->decoding_thread_count; ++i)
+ corrupt_tokens |= pbi->mb_row_di[i].mbd.corrupted;
+ }
+ else
+#endif
+ {
+ decode_mb_rows(pbi);
+ corrupt_tokens |= xd->corrupted;
+ }
+
+ /* Collect information about decoder corruption. */
+ /* 1. Check first boolean decoder for errors. */
+ yv12_fb_new->corrupted = vp8dx_bool_error(bc);
+ /* 2. Check the macroblock information */
+ yv12_fb_new->corrupted |= corrupt_tokens;
+
+ if (!pbi->decoded_key_frame)
+ {
+ if (pc->frame_type == KEY_FRAME &&
+ !yv12_fb_new->corrupted)
+ pbi->decoded_key_frame = 1;
+ else
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+ "A stream must start with a complete key frame");
+ }
+
+ /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes \n",bc->pos+pbi->bc2.pos); */
+
+ if (pc->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+ pbi->independent_partitions = prev_independent_partitions;
+ }
+
+#ifdef PACKET_TESTING
+ {
+ FILE *f = fopen("decompressor.VP8", "ab");
+ unsigned int size = pbi->bc2.pos + pbi->bc.pos + 8;
+ fwrite((void *) &size, 4, 1, f);
+ fwrite((void *) pbi->Source, size, 1, f);
+ fclose(f);
+ }
+#endif
+
+ return 0;
+}
diff --git a/libvpx/vp8/decoder/detokenize.c b/libvpx/vp8/decoder/detokenize.c
new file mode 100644
index 0000000..452ff6c
--- /dev/null
+++ b/libvpx/vp8/decoder/detokenize.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "detokenize.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
+{
+ ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+ ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+
+ vpx_memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ vpx_memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
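+    /* The -1 leaves the last entry of each context (the Y2 context)
+     * untouched; it is reset separately below, and only when this MB
+     * actually codes a Y2 block. */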
+
+ /* Clear entropy contexts for Y2 blocks */
+ if (!x->mode_info_context->mbmi.is_4x4)
+ {
+ a_ctx[8] = l_ctx[8] = 0;
+ }
+}
+
+/*
+ ------------------------------------------------------------------------------
+ Residual decoding (Paragraph 13.2 / 13.3)
+*/
+static const uint8_t kBands[16 + 1] = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 0 /* extra entry as sentinel */
+};
+
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+ { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+#define VP8GetBit vp8dx_decode_bool
+#define NUM_PROBAS 11
+#define NUM_CTX 3
+
+/* for const-casting */
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];
+
+static int GetSigned(BOOL_DECODER *br, int value_to_sign)
+{
+ int split = (br->range + 1) >> 1;
+ VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+ int v;
+
+ if(br->count < 0)
+ vp8dx_bool_decoder_fill(br);
+
+    if (br->value < bigsplit)
+    {
+        br->range = split;
+        v = value_to_sign;
+    }
+    else
+    {
+        br->range = br->range - split;
+        br->value = br->value - bigsplit;
+        v = -value_to_sign;
+    }
+    br->range += br->range;
+    br->value += br->value;
+    br->count--;
+
+ return v;
+}
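+
+/* GetSigned is a bool read at an even probability split (128) fused with
+ * sign application; a hedged equivalent using the reader macro above is
+ *
+ *     v = VP8GetBit(br, 128) ? -value_to_sign : value_to_sign;
+ *
+ * inlined here to save a call in this hot loop. */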
+/*
+ Returns the position of the last non-zero coeff plus one
+ (and 0 if there's no coeff at all)
+*/
+static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob,
+ int ctx, int n, int16_t* out)
+{
+ const uint8_t* p = prob[n][ctx];
+ if (!VP8GetBit(br, p[0]))
+    {   /* the first EOB acts more like a coded-block-pattern (CBP) bit */
+ return 0;
+ }
+ while (1)
+ {
+ ++n;
+ if (!VP8GetBit(br, p[1]))
+ {
+ p = prob[kBands[n]][0];
+ }
+ else
+ { /* non zero coeff */
+ int v, j;
+ if (!VP8GetBit(br, p[2]))
+ {
+ p = prob[kBands[n]][1];
+ v = 1;
+ }
+ else
+ {
+ if (!VP8GetBit(br, p[3]))
+ {
+ if (!VP8GetBit(br, p[4]))
+ {
+ v = 2;
+ }
+ else
+ {
+ v = 3 + VP8GetBit(br, p[5]);
+ }
+ }
+ else
+ {
+ if (!VP8GetBit(br, p[6]))
+ {
+ if (!VP8GetBit(br, p[7]))
+ {
+ v = 5 + VP8GetBit(br, 159);
+ } else
+ {
+ v = 7 + 2 * VP8GetBit(br, 165);
+ v += VP8GetBit(br, 145);
+ }
+ }
+ else
+ {
+ const uint8_t* tab;
+ const int bit1 = VP8GetBit(br, p[8]);
+ const int bit0 = VP8GetBit(br, p[9 + bit1]);
+ const int cat = 2 * bit1 + bit0;
+ v = 0;
+ for (tab = kCat3456[cat]; *tab; ++tab)
+ {
+ v += v + VP8GetBit(br, *tab);
+ }
+ v += 3 + (8 << cat);
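+                        /* i.e. bases 11, 19, 35 and 67 for cat3-cat6 */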
+ }
+ }
+ p = prob[kBands[n]][2];
+ }
+ j = kZigzag[n - 1];
+
+ out[j] = GetSigned(br, v);
+
+ if (n == 16 || !VP8GetBit(br, p[0]))
+ { /* EOB */
+ return n;
+ }
+ }
+ if (n == 16)
+ {
+ return 16;
+ }
+ }
+}
+
+int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
+{
+ BOOL_DECODER *bc = x->current_bc;
+ const FRAME_CONTEXT * const fc = &dx->common.fc;
+ char *eobs = x->eobs;
+
+ int i;
+ int nonzeros;
+ int eobtotal = 0;
+
+ short *qcoeff_ptr;
+ ProbaArray coef_probs;
+ ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+ ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+ ENTROPY_CONTEXT *a;
+ ENTROPY_CONTEXT *l;
+ int skip_dc = 0;
+
+ qcoeff_ptr = &x->qcoeff[0];
+
+ if (!x->mode_info_context->mbmi.is_4x4)
+ {
+ a = a_ctx + 8;
+ l = l_ctx + 8;
+
+ coef_probs = fc->coef_probs [1];
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16);
+ *a = *l = (nonzeros > 0);
+
+ eobs[24] = nonzeros;
+ eobtotal += nonzeros - 16;
+
+ coef_probs = fc->coef_probs [0];
+ skip_dc = 1;
+ }
+ else
+ {
+ coef_probs = fc->coef_probs [3];
+ skip_dc = 0;
+ }
+
+ for (i = 0; i < 16; ++i)
+ {
+ a = a_ctx + (i&3);
+ l = l_ctx + ((i&0xc)>>2);
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr);
+ *a = *l = (nonzeros > 0);
+
+ nonzeros += skip_dc;
+ eobs[i] = nonzeros;
+ eobtotal += nonzeros;
+ qcoeff_ptr += 16;
+ }
+
+ coef_probs = fc->coef_probs [2];
+
+ a_ctx += 4;
+ l_ctx += 4;
+ for (i = 16; i < 24; ++i)
+ {
+ a = a_ctx + ((i > 19)<<1) + (i&1);
+ l = l_ctx + ((i > 19)<<1) + ((i&3)>1);
+
+ nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr);
+ *a = *l = (nonzeros > 0);
+
+ eobs[i] = nonzeros;
+ eobtotal += nonzeros;
+ qcoeff_ptr += 16;
+ }
+
+ return eobtotal;
+}
+
diff --git a/libvpx/vp8/decoder/detokenize.h b/libvpx/vp8/decoder/detokenize.h
new file mode 100644
index 0000000..8640bda
--- /dev/null
+++ b/libvpx/vp8/decoder/detokenize.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DETOKENIZE_H
+#define DETOKENIZE_H
+
+#include "onyxd_int.h"
+
+void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
+int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
+
+#endif /* DETOKENIZE_H */
diff --git a/libvpx/vp8/decoder/ec_types.h b/libvpx/vp8/decoder/ec_types.h
new file mode 100644
index 0000000..ccb5ddb
--- /dev/null
+++ b/libvpx/vp8/decoder/ec_types.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DEC_EC_TYPES_H
+#define VP8_DEC_EC_TYPES_H
+
+#define MAX_OVERLAPS 16
+
+
+
+/* Records the area (in Q6 pixel units) by which the block pointed to by
+ * bmi overlaps another block.
+ */
+typedef struct
+{
+ int overlap;
+ union b_mode_info *bmi;
+} OVERLAP_NODE;
+
+/* Structure to keep track of overlapping blocks on a block level. */
+typedef struct
+{
+ /* TODO(holmer): This array should be exchanged for a linked list */
+ OVERLAP_NODE overlaps[MAX_OVERLAPS];
+} B_OVERLAP;
+
+/* Structure used to hold all the overlaps of a macroblock. The overlaps
+ * of a macroblock are further divided into per-block overlaps.
+ */
+typedef struct
+{
+ B_OVERLAP overlaps[16];
+} MB_OVERLAP;
+
+/* Structure for keeping track of motion vectors and which reference frame they
+ * refer to. Used for motion vector interpolation.
+ */
+typedef struct
+{
+ MV mv;
+ MV_REFERENCE_FRAME ref_frame;
+} EC_BLOCK;
+
+#endif /* VP8_DEC_EC_TYPES_H */
diff --git a/libvpx/vp8/decoder/error_concealment.c b/libvpx/vp8/decoder/error_concealment.c
new file mode 100644
index 0000000..8b2e32b
--- /dev/null
+++ b/libvpx/vp8/decoder/error_concealment.c
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "error_concealment.h"
+#include "onyxd_int.h"
+#include "decodemv.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/findnearmv.h"
+
+#include <assert.h>
+
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+#define FLOOR(x,q) ((x) & -(1 << (q)))
+
+#define NUM_NEIGHBORS 20
+
+typedef struct ec_position
+{
+ int row;
+ int col;
+} EC_POS;
+
+/*
+ * Regenerate the table in Matlab with:
+ * x = meshgrid((0:4), (0:4));
+ * y = meshgrid((0:4), (0:4))';
+ * W = round((1./(sqrt(x.^2 + y.^2))*2^7));
+ * W(1,1) = 0;
+ */
+static const int weights_q7[5][5] = {
+ { 0, 128, 64, 43, 32 },
+ {128, 91, 57, 40, 31 },
+ { 64, 57, 45, 36, 29 },
+ { 43, 40, 36, 30, 26 },
+ { 32, 31, 29, 26, 23 }
+};
+
+int vp8_alloc_overlap_lists(VP8D_COMP *pbi)
+{
+ if (pbi->overlaps != NULL)
+ {
+ vpx_free(pbi->overlaps);
+ pbi->overlaps = NULL;
+ }
+
+ pbi->overlaps = vpx_calloc(pbi->common.mb_rows * pbi->common.mb_cols,
+ sizeof(MB_OVERLAP));
+
+ if (pbi->overlaps == NULL)
+ return -1;
+
+ return 0;
+}
+
+void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi)
+{
+ vpx_free(pbi->overlaps);
+ pbi->overlaps = NULL;
+}
+
+/* Inserts a new overlap area value to the list of overlaps of a block */
+static void assign_overlap(OVERLAP_NODE* overlaps,
+ union b_mode_info *bmi,
+ int overlap)
+{
+ int i;
+ if (overlap <= 0)
+ return;
+ /* Find and assign to the next empty overlap node in the list of overlaps.
+ * Empty is defined as bmi == NULL */
+ for (i = 0; i < MAX_OVERLAPS; i++)
+ {
+ if (overlaps[i].bmi == NULL)
+ {
+ overlaps[i].bmi = bmi;
+ overlaps[i].overlap = overlap;
+ break;
+ }
+ }
+}
+
+/* Calculates the overlap area between two 4x4 squares, where the first
+ * square has its upper-left corner at (b1_row, b1_col) and the second
+ * square has its upper-left corner at (b2_row, b2_col). Doesn't
+ * properly handle squares which do not overlap.
+ */
+static int block_overlap(int b1_row, int b1_col, int b2_row, int b2_col)
+{
+    const int int_top = MAX(b1_row, b2_row);     /* top edge */
+    const int int_left = MAX(b1_col, b2_col);    /* left edge */
+ /* Since each block is 4x4 pixels, adding 4 (Q3) to the left/top edge
+ * gives us the right/bottom edge.
+ */
+    const int int_right = MIN(b1_col + (4<<3), b2_col + (4<<3));    /* right edge */
+    const int int_bottom = MIN(b1_row + (4<<3), b2_row + (4<<3));   /* bottom edge */
+ return (int_bottom - int_top) * (int_right - int_left);
+}
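+
+/* Worked example with hypothetical Q3 inputs: two blocks in the same
+ * column whose rows are 16 apart in Q3 (2 pels) overlap over 2x4 pels:
+ *
+ *     block_overlap(0, 0, 16, 0)
+ *         = (MIN(32, 48) - 16) * (MIN(32, 32) - 0)
+ *         = 16 * 32 = 512 in Q6, i.e. 2 * 4 = 8 square pels
+ */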
+
+/* Calculates the overlap area for all blocks in a macroblock at position
+ * (mb_row, mb_col) in macroblocks, which are being overlapped by a given
+ * overlapping block at position (new_row, new_col) (in pixels, Q3). The
+ * first block being overlapped in the macroblock has position (first_blk_row,
+ * first_blk_col) in blocks relative to the upper-left corner of the image.
+ */
+static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi,
+ int new_row, int new_col,
+ int mb_row, int mb_col,
+ int first_blk_row, int first_blk_col)
+{
+ /* Find the blocks within this MB (defined by mb_row, mb_col) which are
+ * overlapped by bmi and calculate and assign overlap for each of those
+ * blocks. */
+
+    /* Block coordinates relative to the upper-left block */
+ const int rel_ol_blk_row = first_blk_row - mb_row * 4;
+ const int rel_ol_blk_col = first_blk_col - mb_col * 4;
+ /* If the block partly overlaps any previous MB, these coordinates
+ * can be < 0. We don't want to access blocks in previous MBs.
+ */
+ const int blk_idx = MAX(rel_ol_blk_row,0) * 4 + MAX(rel_ol_blk_col,0);
+ /* Upper left overlapping block */
+ B_OVERLAP *b_ol_ul = &(b_overlaps[blk_idx]);
+
+ /* Calculate and assign overlaps for all blocks in this MB
+ * which the motion compensated block overlaps
+ */
+ /* Avoid calculating overlaps for blocks in later MBs */
+ int end_row = MIN(4 + mb_row * 4 - first_blk_row, 2);
+ int end_col = MIN(4 + mb_col * 4 - first_blk_col, 2);
+ int row, col;
+
+ /* Check if new_row and new_col are evenly divisible by 4 (Q3),
+ * and if so we shouldn't check neighboring blocks
+ */
+ if (new_row >= 0 && (new_row & 0x1F) == 0)
+ end_row = 1;
+ if (new_col >= 0 && (new_col & 0x1F) == 0)
+ end_col = 1;
+
+ /* Check if the overlapping block partly overlaps a previous MB
+ * and if so, we're overlapping fewer blocks in this MB.
+ */
+ if (new_row < (mb_row*16)<<3)
+ end_row = 1;
+ if (new_col < (mb_col*16)<<3)
+ end_col = 1;
+
+ for (row = 0; row < end_row; ++row)
+ {
+ for (col = 0; col < end_col; ++col)
+ {
+ /* input in Q3, result in Q6 */
+ const int overlap = block_overlap(new_row, new_col,
+ (((first_blk_row + row) *
+ 4) << 3),
+ (((first_blk_col + col) *
+ 4) << 3));
+ assign_overlap(b_ol_ul[row * 4 + col].overlaps, bmi, overlap);
+ }
+ }
+}
+
+void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul,
+ int mb_rows, int mb_cols,
+ union b_mode_info *bmi,
+ int b_row, int b_col)
+{
+ MB_OVERLAP *mb_overlap;
+ int row, col, rel_row, rel_col;
+ int new_row, new_col;
+ int end_row, end_col;
+ int overlap_b_row, overlap_b_col;
+ int overlap_mb_row, overlap_mb_col;
+
+ /* mb subpixel position */
+ row = (4 * b_row) << 3; /* Q3 */
+ col = (4 * b_col) << 3; /* Q3 */
+
+ /* reverse compensate for motion */
+ new_row = row - bmi->mv.as_mv.row;
+ new_col = col - bmi->mv.as_mv.col;
+
+ if (new_row >= ((16*mb_rows) << 3) || new_col >= ((16*mb_cols) << 3))
+ {
+ /* the new block ended up outside the frame */
+ return;
+ }
+
+ if (new_row <= (-4 << 3) || new_col <= (-4 << 3))
+ {
+ /* outside the frame */
+ return;
+ }
+ /* overlapping block's position in blocks */
+ overlap_b_row = FLOOR(new_row / 4, 3) >> 3;
+ overlap_b_col = FLOOR(new_col / 4, 3) >> 3;
+
+ /* overlapping block's MB position in MBs
+ * operations are done in Q3
+ */
+ overlap_mb_row = FLOOR((overlap_b_row << 3) / 4, 3) >> 3;
+ overlap_mb_col = FLOOR((overlap_b_col << 3) / 4, 3) >> 3;
+
+ end_row = MIN(mb_rows - overlap_mb_row, 2);
+ end_col = MIN(mb_cols - overlap_mb_col, 2);
+
+ /* Don't calculate overlap for MBs we don't overlap */
+ /* Check if the new block row starts at the last block row of the MB */
+ if (abs(new_row - ((16*overlap_mb_row) << 3)) < ((3*4) << 3))
+ end_row = 1;
+ /* Check if the new block col starts at the last block col of the MB */
+ if (abs(new_col - ((16*overlap_mb_col) << 3)) < ((3*4) << 3))
+ end_col = 1;
+
+ /* find the MB(s) this block is overlapping */
+ for (rel_row = 0; rel_row < end_row; ++rel_row)
+ {
+ for (rel_col = 0; rel_col < end_col; ++rel_col)
+ {
+ if (overlap_mb_row + rel_row < 0 ||
+ overlap_mb_col + rel_col < 0)
+ continue;
+ mb_overlap = overlap_ul + (overlap_mb_row + rel_row) * mb_cols +
+ overlap_mb_col + rel_col;
+
+ calculate_overlaps_mb(mb_overlap->overlaps, bmi,
+ new_row, new_col,
+ overlap_mb_row + rel_row,
+ overlap_mb_col + rel_col,
+ overlap_b_row + rel_row,
+ overlap_b_col + rel_col);
+ }
+ }
+}
+
+/* Estimates a motion vector given the overlapping blocks' motion vectors.
+ * Filters out all overlapping blocks which do not refer to the correct
+ * reference frame type.
+ */
+static void estimate_mv(const OVERLAP_NODE *overlaps, union b_mode_info *bmi)
+{
+ int i;
+ int overlap_sum = 0;
+ int row_acc = 0;
+ int col_acc = 0;
+
+ bmi->mv.as_int = 0;
+ for (i=0; i < MAX_OVERLAPS; ++i)
+ {
+ if (overlaps[i].bmi == NULL)
+ break;
+ col_acc += overlaps[i].overlap * overlaps[i].bmi->mv.as_mv.col;
+ row_acc += overlaps[i].overlap * overlaps[i].bmi->mv.as_mv.row;
+ overlap_sum += overlaps[i].overlap;
+ }
+ if (overlap_sum > 0)
+ {
+ /* Q9 / Q6 = Q3 */
+ bmi->mv.as_mv.col = col_acc / overlap_sum;
+ bmi->mv.as_mv.row = row_acc / overlap_sum;
+ }
+ else
+ {
+ bmi->mv.as_mv.col = 0;
+ bmi->mv.as_mv.row = 0;
+ }
+}
+
+/* Estimates all motion vectors for a macroblock given the lists of
+ * overlaps for each block. Decides whether or not the MVs must be clamped.
+ */
+static void estimate_mb_mvs(const B_OVERLAP *block_overlaps,
+ MODE_INFO *mi,
+ int mb_to_left_edge,
+ int mb_to_right_edge,
+ int mb_to_top_edge,
+ int mb_to_bottom_edge)
+{
+ int row, col;
+ int non_zero_count = 0;
+ MV * const filtered_mv = &(mi->mbmi.mv.as_mv);
+ union b_mode_info * const bmi = mi->bmi;
+ filtered_mv->col = 0;
+ filtered_mv->row = 0;
+ mi->mbmi.need_to_clamp_mvs = 0;
+ for (row = 0; row < 4; ++row)
+ {
+ int this_b_to_top_edge = mb_to_top_edge + ((row*4)<<3);
+ int this_b_to_bottom_edge = mb_to_bottom_edge - ((row*4)<<3);
+ for (col = 0; col < 4; ++col)
+ {
+ int i = row * 4 + col;
+ int this_b_to_left_edge = mb_to_left_edge + ((col*4)<<3);
+ int this_b_to_right_edge = mb_to_right_edge - ((col*4)<<3);
+ /* Estimate vectors for all blocks which are overlapped by this */
+ /* type. Interpolate/extrapolate the rest of the block's MVs */
+ estimate_mv(block_overlaps[i].overlaps, &(bmi[i]));
+ mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds(
+ &bmi[i].mv,
+ this_b_to_left_edge,
+ this_b_to_right_edge,
+ this_b_to_top_edge,
+ this_b_to_bottom_edge);
+ if (bmi[i].mv.as_int != 0)
+ {
+ ++non_zero_count;
+ filtered_mv->col += bmi[i].mv.as_mv.col;
+ filtered_mv->row += bmi[i].mv.as_mv.row;
+ }
+ }
+ }
+ if (non_zero_count > 0)
+ {
+ filtered_mv->col /= non_zero_count;
+ filtered_mv->row /= non_zero_count;
+ }
+}
+
+static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols)
+{
+ int sub_row;
+ int sub_col;
+ for (sub_row = 0; sub_row < 4; ++sub_row)
+ {
+ for (sub_col = 0; sub_col < 4; ++sub_col)
+ {
+ vp8_calculate_overlaps(
+ overlaps, mb_rows, mb_cols,
+ &(prev_mi->bmi[sub_row * 4 + sub_col]),
+ 4 * mb_row + sub_row,
+ 4 * mb_col + sub_col);
+ }
+ }
+}
+
+/* Estimate all missing motion vectors. Does the actual work for
+ * vp8_estimate_missing_mvs() below, taking its inputs explicitly. */
+static void estimate_missing_mvs(MB_OVERLAP *overlaps,
+ MODE_INFO *mi, MODE_INFO *prev_mi,
+ int mb_rows, int mb_cols,
+ unsigned int first_corrupt)
+{
+ int mb_row, mb_col;
+ vpx_memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
+ /* First calculate the overlaps for all blocks */
+ for (mb_row = 0; mb_row < mb_rows; ++mb_row)
+ {
+ for (mb_col = 0; mb_col < mb_cols; ++mb_col)
+ {
+ /* We're only able to use blocks referring to the last frame
+ * when extrapolating new vectors.
+ */
+ if (prev_mi->mbmi.ref_frame == LAST_FRAME)
+ {
+ calc_prev_mb_overlaps(overlaps, prev_mi,
+ mb_row, mb_col,
+ mb_rows, mb_cols);
+ }
+ ++prev_mi;
+ }
+ ++prev_mi;
+ }
+
+ mb_row = first_corrupt / mb_cols;
+ mb_col = first_corrupt - mb_row * mb_cols;
+ mi += mb_row*(mb_cols + 1) + mb_col;
+ /* Go through all macroblocks in the current image with missing MVs
+ * and calculate new MVs using the overlaps.
+ */
+ for (; mb_row < mb_rows; ++mb_row)
+ {
+ int mb_to_top_edge = -((mb_row * 16)) << 3;
+ int mb_to_bottom_edge = ((mb_rows - 1 - mb_row) * 16) << 3;
+ for (; mb_col < mb_cols; ++mb_col)
+ {
+ int mb_to_left_edge = -((mb_col * 16) << 3);
+ int mb_to_right_edge = ((mb_cols - 1 - mb_col) * 16) << 3;
+ const B_OVERLAP *block_overlaps =
+ overlaps[mb_row*mb_cols + mb_col].overlaps;
+ mi->mbmi.ref_frame = LAST_FRAME;
+ mi->mbmi.mode = SPLITMV;
+ mi->mbmi.uv_mode = DC_PRED;
+ mi->mbmi.partitioning = 3;
+ mi->mbmi.segment_id = 0;
+ estimate_mb_mvs(block_overlaps,
+ mi,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ ++mi;
+ }
+ mb_col = 0;
+ ++mi;
+ }
+}
+
+void vp8_estimate_missing_mvs(VP8D_COMP *pbi)
+{
+ VP8_COMMON * const pc = &pbi->common;
+ estimate_missing_mvs(pbi->overlaps,
+ pc->mi, pc->prev_mi,
+ pc->mb_rows, pc->mb_cols,
+ pbi->mvs_corrupt_from_mb);
+}
+
+static void assign_neighbor(EC_BLOCK *neighbor, MODE_INFO *mi, int block_idx)
+{
+ assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
+ neighbor->ref_frame = mi->mbmi.ref_frame;
+ neighbor->mv = mi->bmi[block_idx].mv.as_mv;
+}
+
+/* Finds the neighboring blocks of a macroblock. In the general case
+ * 20 blocks are found. If fewer blocks are found due to image
+ * boundaries, those positions in the EC_BLOCK array are left "empty".
+ * The neighbors are enumerated with the upper-left neighbor as the first
+ * element, the second element refers to the neighbor to the right of the previous
+ * neighbor, and so on. The last element refers to the neighbor below the first
+ * neighbor.
+ */
+static void find_neighboring_blocks(MODE_INFO *mi,
+ EC_BLOCK *neighbors,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols,
+ int mi_stride)
+{
+ int i = 0;
+ int j;
+ if (mb_row > 0)
+ {
+ /* upper left */
+ if (mb_col > 0)
+ assign_neighbor(&neighbors[i], mi - mi_stride - 1, 15);
+ ++i;
+ /* above */
+ for (j = 12; j < 16; ++j, ++i)
+ assign_neighbor(&neighbors[i], mi - mi_stride, j);
+ }
+ else
+ i += 5;
+ if (mb_col < mb_cols - 1)
+ {
+ /* upper right */
+ if (mb_row > 0)
+ assign_neighbor(&neighbors[i], mi - mi_stride + 1, 12);
+ ++i;
+ /* right */
+ for (j = 0; j <= 12; j += 4, ++i)
+ assign_neighbor(&neighbors[i], mi + 1, j);
+ }
+ else
+ i += 5;
+ if (mb_row < mb_rows - 1)
+ {
+ /* lower right */
+ if (mb_col < mb_cols - 1)
+ assign_neighbor(&neighbors[i], mi + mi_stride + 1, 0);
+ ++i;
+ /* below */
+ for (j = 0; j < 4; ++j, ++i)
+ assign_neighbor(&neighbors[i], mi + mi_stride, j);
+ }
+ else
+ i += 5;
+ if (mb_col > 0)
+ {
+ /* lower left */
+ if (mb_row < mb_rows - 1)
+ assign_neighbor(&neighbors[i], mi + mi_stride - 1, 4);
+ ++i;
+ /* left */
+ for (j = 3; j < 16; j += 4, ++i)
+ {
+ assign_neighbor(&neighbors[i], mi - 1, j);
+ }
+ }
+ else
+ i += 5;
+ assert(i == 20);
+}
+
+/* Interpolates all motion vectors for a macroblock from the neighboring blocks'
+ * motion vectors.
+ */
+static void interpolate_mvs(MACROBLOCKD *mb,
+ EC_BLOCK *neighbors,
+ MV_REFERENCE_FRAME dom_ref_frame)
+{
+ int row, col, i;
+ MODE_INFO * const mi = mb->mode_info_context;
+    /* Table with the positions of the neighboring blocks relative to the position
+ * of the upper left block of the current MB. Starting with the upper left
+ * neighbor and going to the right.
+ */
+ const EC_POS neigh_pos[NUM_NEIGHBORS] = {
+ {-1,-1}, {-1,0}, {-1,1}, {-1,2}, {-1,3},
+ {-1,4}, {0,4}, {1,4}, {2,4}, {3,4},
+ {4,4}, {4,3}, {4,2}, {4,1}, {4,0},
+ {4,-1}, {3,-1}, {2,-1}, {1,-1}, {0,-1}
+ };
+ mi->mbmi.need_to_clamp_mvs = 0;
+ for (row = 0; row < 4; ++row)
+ {
+ int mb_to_top_edge = mb->mb_to_top_edge + ((row*4)<<3);
+ int mb_to_bottom_edge = mb->mb_to_bottom_edge - ((row*4)<<3);
+ for (col = 0; col < 4; ++col)
+ {
+ int mb_to_left_edge = mb->mb_to_left_edge + ((col*4)<<3);
+ int mb_to_right_edge = mb->mb_to_right_edge - ((col*4)<<3);
+ int w_sum = 0;
+ int mv_row_sum = 0;
+ int mv_col_sum = 0;
+ int_mv * const mv = &(mi->bmi[row*4 + col].mv);
+ mv->as_int = 0;
+ for (i = 0; i < NUM_NEIGHBORS; ++i)
+ {
+ /* Calculate the weighted sum of neighboring MVs referring
+ * to the dominant frame type.
+ */
+ const int w = weights_q7[abs(row - neigh_pos[i].row)]
+ [abs(col - neigh_pos[i].col)];
+ if (neighbors[i].ref_frame != dom_ref_frame)
+ continue;
+ w_sum += w;
+ /* Q7 * Q3 = Q10 */
+ mv_row_sum += w*neighbors[i].mv.row;
+ mv_col_sum += w*neighbors[i].mv.col;
+ }
+ if (w_sum > 0)
+ {
+ /* Avoid division by zero.
+ * Normalize with the sum of the coefficients
+ * Q3 = Q10 / Q7
+ */
+ mv->as_mv.row = mv_row_sum / w_sum;
+ mv->as_mv.col = mv_col_sum / w_sum;
+ mi->mbmi.need_to_clamp_mvs |= vp8_check_mv_bounds(
+ mv,
+ mb_to_left_edge,
+ mb_to_right_edge,
+ mb_to_top_edge,
+ mb_to_bottom_edge);
+ }
+ }
+ }
+}
+
+void vp8_interpolate_motion(MACROBLOCKD *mb,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols,
+ int mi_stride)
+{
+ /* Find relevant neighboring blocks */
+ EC_BLOCK neighbors[NUM_NEIGHBORS];
+ int i;
+ /* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */
+ for (i = 0; i < NUM_NEIGHBORS; ++i)
+ {
+ neighbors[i].ref_frame = MAX_REF_FRAMES;
+ neighbors[i].mv.row = neighbors[i].mv.col = 0;
+ }
+ find_neighboring_blocks(mb->mode_info_context,
+ neighbors,
+ mb_row, mb_col,
+ mb_rows, mb_cols,
+ mb->mode_info_stride);
+ /* Interpolate MVs for the missing blocks from the surrounding
+ * blocks which refer to the last frame. */
+ interpolate_mvs(mb, neighbors, LAST_FRAME);
+
+ mb->mode_info_context->mbmi.ref_frame = LAST_FRAME;
+ mb->mode_info_context->mbmi.mode = SPLITMV;
+ mb->mode_info_context->mbmi.uv_mode = DC_PRED;
+ mb->mode_info_context->mbmi.partitioning = 3;
+ mb->mode_info_context->mbmi.segment_id = 0;
+}
+
+void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
+{
+    /* This macroblock has corrupt residual; use the motion-compensated
+     * image (the predictor) for concealment. */
+
+ /* The build predictor functions now output directly into the dst buffer,
+ * so the copies are no longer necessary */
+
+}
diff --git a/libvpx/vp8/decoder/error_concealment.h b/libvpx/vp8/decoder/error_concealment.h
new file mode 100644
index 0000000..65ae9d9
--- /dev/null
+++ b/libvpx/vp8/decoder/error_concealment.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef ERROR_CONCEALMENT_H
+#define ERROR_CONCEALMENT_H
+
+#include "onyxd_int.h"
+#include "ec_types.h"
+
+/* Allocate memory for the overlap lists */
+int vp8_alloc_overlap_lists(VP8D_COMP *pbi);
+
+/* Deallocate the overlap lists */
+void vp8_de_alloc_overlap_lists(VP8D_COMP *pbi);
+
+/* Estimate all missing motion vectors. */
+void vp8_estimate_missing_mvs(VP8D_COMP *pbi);
+
+/* Functions for spatial MV interpolation */
+
+/* Interpolates all motion vectors for a macroblock mb at position
+ * (mb_row, mb_col). */
+void vp8_interpolate_motion(MACROBLOCKD *mb,
+ int mb_row, int mb_col,
+ int mb_rows, int mb_cols,
+ int mi_stride);
+
+/* Conceal a macroblock with corrupt residual.
+ * Copies the prediction signal to the reconstructed image.
+ */
+void vp8_conceal_corrupt_mb(MACROBLOCKD *xd);
+
+#endif
diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c
new file mode 100644
index 0000000..8d6871b
--- /dev/null
+++ b/libvpx/vp8/decoder/onyxd_if.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vp8/common/onyxd.h"
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "decoderthreading.h"
+#include <stdio.h>
+#include <assert.h>
+
+#include "vp8/common/quant_common.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/systemdependent.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+static int get_free_fb (VP8_COMMON *cm);
+static void ref_cnt_fb (int *buf, int *idx, int new_idx);
+
+struct VP8D_COMP * vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
+{
+ VP8D_COMP *pbi = vpx_memalign(32, sizeof(VP8D_COMP));
+
+ if (!pbi)
+ return NULL;
+
+ vpx_memset(pbi, 0, sizeof(VP8D_COMP));
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ pbi->common.error.setjmp = 0;
+ vp8dx_remove_decompressor(pbi);
+ return 0;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ vp8_create_common(&pbi->common);
+
+ pbi->common.current_video_frame = 0;
+ pbi->ready_for_new_data = 1;
+
+#if CONFIG_MULTITHREAD
+ pbi->max_threads = oxcf->max_threads;
+ vp8_decoder_create_threads(pbi);
+#endif
+
+    /* vp8cx_init_de_quantizer() is first called here. A check should be
+     * added in frame_init_dequantizer() to avoid calling
+     * vp8cx_init_de_quantizer() unnecessarily for every frame.
+     */
+ vp8cx_init_de_quantizer(pbi);
+
+ vp8_loop_filter_init(&pbi->common);
+
+ pbi->common.error.setjmp = 0;
+
+#if CONFIG_ERROR_CONCEALMENT
+ pbi->ec_enabled = oxcf->error_concealment;
+ pbi->overlaps = NULL;
+#else
+ pbi->ec_enabled = 0;
+#endif
+    /* When enabled, error concealment becomes active only after a key
+     * frame has been decoded without errors.
+ */
+ pbi->ec_active = 0;
+
+ pbi->decoded_key_frame = 0;
+
+ pbi->input_fragments = oxcf->input_fragments;
+ pbi->num_fragments = 0;
+
+ /* Independent partitions is activated when a frame updates the
+ * token probability table to have equal probabilities over the
+ * PREV_COEF context.
+ */
+ pbi->independent_partitions = 0;
+
+ vp8_setup_block_dptrs(&pbi->mb);
+
+ return pbi;
+}
+
+
+void vp8dx_remove_decompressor(VP8D_COMP *pbi)
+{
+ if (!pbi)
+ return;
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
+ vp8_decoder_remove_threads(pbi);
+#endif
+#if CONFIG_ERROR_CONCEALMENT
+ vp8_de_alloc_overlap_lists(pbi);
+#endif
+ vp8_remove_common(&pbi->common);
+ vpx_free(pbi);
+}
+
+
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else{
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if(cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||
+ cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||
+ cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||
+ cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width){
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ }
+ else
+ vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return pbi->common.error.error_code;
+}
+
+
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &pbi->common;
+ int *ref_fb_ptr = NULL;
+ int free_fb;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_ptr = &cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_ptr = &cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_ptr = &cm->alt_fb_idx;
+ else{
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Invalid reference frame");
+ return pbi->common.error.error_code;
+ }
+
+ if(cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||
+ cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||
+ cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||
+ cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width){
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ }
+ else{
+ /* Find an empty frame buffer. */
+ free_fb = get_free_fb(cm);
+ /* Decrease fb_idx_ref_cnt since it will be increased again in
+ * ref_cnt_fb() below. */
+ cm->fb_idx_ref_cnt[free_fb]--;
+
+ /* Manage the reference counters and copy image. */
+ ref_cnt_fb (cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);
+ vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]);
+ }
+
+ return pbi->common.error.error_code;
+}
+
+/* For ARM NEON, d8-d15 are callee-saved registers and need to be saved by us. */
+#if HAVE_NEON
+extern void vp8_push_neon(int64_t *store);
+extern void vp8_pop_neon(int64_t *store);
+#endif
+
+static int get_free_fb (VP8_COMMON *cm)
+{
+ int i;
+ for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ if (cm->fb_idx_ref_cnt[i] == 0)
+ break;
+
+ assert(i < NUM_YV12_BUFFERS);
+ cm->fb_idx_ref_cnt[i] = 1;
+ return i;
+}
+
+static void ref_cnt_fb (int *buf, int *idx, int new_idx)
+{
+ if (buf[*idx] > 0)
+ buf[*idx]--;
+
+ *idx = new_idx;
+
+ buf[new_idx]++;
+}
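+
+/* Reference counting keeps a yv12_fb[] slot alive while any of the
+ * last/golden/alt-ref indices (or the frame being decoded) refers to it.
+ * A hedged sketch of the pattern used by vp8dx_set_reference() above:
+ *
+ *     int free_fb = get_free_fb(cm);         count becomes 1
+ *     cm->fb_idx_ref_cnt[free_fb]--;         ref_cnt_fb() re-adds it
+ *     ref_cnt_fb(cm->fb_idx_ref_cnt,
+ *                &cm->lst_fb_idx, free_fb);  old last frame drops a ref
+ */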
+
+/* If any buffer copy / swapping is signalled it should be done here. */
+static int swap_frame_buffers (VP8_COMMON *cm)
+{
+ int err = 0;
+
+ /* The alternate reference frame or golden frame can be updated
+ * using the new, last, or golden/alt ref frame. If it
+ * is updated using the newly decoded frame it is a refresh.
+ * An update using the last or golden/alt ref frame is a copy.
+ */
+ if (cm->copy_buffer_to_arf)
+ {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_arf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_arf == 2)
+ new_fb = cm->gld_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);
+ }
+
+ if (cm->copy_buffer_to_gf)
+ {
+ int new_fb = 0;
+
+ if (cm->copy_buffer_to_gf == 1)
+ new_fb = cm->lst_fb_idx;
+ else if (cm->copy_buffer_to_gf == 2)
+ new_fb = cm->alt_fb_idx;
+ else
+ err = -1;
+
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);
+ }
+
+ if (cm->refresh_golden_frame)
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_alt_ref_frame)
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);
+
+ if (cm->refresh_last_frame)
+ {
+ ref_cnt_fb (cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);
+
+ cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];
+ }
+ else
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ return err;
+}
+
+int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
+ const uint8_t *source,
+ int64_t time_stamp)
+{
+#if HAVE_NEON
+ int64_t dx_store_reg[8];
+#endif
+ VP8_COMMON *cm = &pbi->common;
+ int retcode = -1;
+
+ pbi->common.error.error_code = VPX_CODEC_OK;
+
+ if (pbi->num_fragments == 0)
+ {
+ /* New frame, reset fragment pointers and sizes */
+ vpx_memset((void*)pbi->fragments, 0, sizeof(pbi->fragments));
+ vpx_memset(pbi->fragment_sizes, 0, sizeof(pbi->fragment_sizes));
+ }
+ if (pbi->input_fragments && !(source == NULL && size == 0))
+ {
+ /* Store a pointer to this fragment and return. We haven't
+         * received the complete frame yet, so decoding is deferred.
+ */
+ assert(pbi->num_fragments < MAX_PARTITIONS);
+ pbi->fragments[pbi->num_fragments] = source;
+ pbi->fragment_sizes[pbi->num_fragments] = size;
+ pbi->num_fragments++;
+ if (pbi->num_fragments > (1 << EIGHT_PARTITION) + 1)
+ {
+ pbi->common.error.error_code = VPX_CODEC_UNSUP_BITSTREAM;
+ pbi->common.error.setjmp = 0;
+ pbi->num_fragments = 0;
+ return -1;
+ }
+ return 0;
+ }
+
+ if (!pbi->input_fragments)
+ {
+ pbi->fragments[0] = source;
+ pbi->fragment_sizes[0] = size;
+ pbi->num_fragments = 1;
+ }
+ assert(pbi->common.multi_token_partition <= EIGHT_PARTITION);
+ if (pbi->num_fragments == 0)
+ {
+ pbi->num_fragments = 1;
+ pbi->fragments[0] = NULL;
+ pbi->fragment_sizes[0] = 0;
+ }
+
+ if (!pbi->ec_active &&
+ pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
+ {
+ /* If error concealment is disabled we won't signal missing frames
+ * to the decoder.
+ */
+ if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
+ {
+ /* The last reference shares buffer with another reference
+ * buffer. Move it to its own buffer before setting it as
+ * corrupt, otherwise we will make multiple buffers corrupt.
+ */
+ const int prev_idx = cm->lst_fb_idx;
+ cm->fb_idx_ref_cnt[prev_idx]--;
+ cm->lst_fb_idx = get_free_fb(cm);
+ vp8_yv12_copy_frame(&cm->yv12_fb[prev_idx],
+ &cm->yv12_fb[cm->lst_fb_idx]);
+ }
+ /* This is used to signal that we are missing frames.
+         * We do not know if the missing frame(s) were supposed to update
+         * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ /* Signal that we have no frame to show. */
+ cm->show_frame = 0;
+
+ pbi->num_fragments = 0;
+
+ /* Nothing more to do. */
+ return 0;
+ }
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(dx_store_reg);
+ }
+#endif
+
+ cm->new_fb_idx = get_free_fb (cm);
+
+ /* setup reference frames for vp8_decode_frame */
+ pbi->dec_fb_ref[INTRA_FRAME] = &cm->yv12_fb[cm->new_fb_idx];
+ pbi->dec_fb_ref[LAST_FRAME] = &cm->yv12_fb[cm->lst_fb_idx];
+ pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx];
+ pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx];
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+        /* We do not know if the missing frame(s) were supposed to update
+         * any of the reference buffers, but we act conservatively and
+ * mark only the last buffer as corrupted.
+ */
+ cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ goto decode_exit;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ retcode = vp8_decode_frame(pbi);
+
+ if (retcode < 0)
+ {
+ if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
+ cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
+
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ goto decode_exit;
+ }
+
+ if (swap_frame_buffers (cm))
+ {
+ pbi->common.error.error_code = VPX_CODEC_ERROR;
+ goto decode_exit;
+ }
+
+ vp8_clear_system_state();
+
+#if CONFIG_ERROR_CONCEALMENT
+ /* swap the mode infos to storage for future error concealment */
+ if (pbi->ec_enabled && pbi->common.prev_mi)
+ {
+ MODE_INFO* tmp = pbi->common.prev_mi;
+ int row, col;
+ pbi->common.prev_mi = pbi->common.mi;
+ pbi->common.mi = tmp;
+
+ /* Propagate the segment_ids to the next frame */
+ for (row = 0; row < pbi->common.mb_rows; ++row)
+ {
+ for (col = 0; col < pbi->common.mb_cols; ++col)
+ {
+ const int i = row*pbi->common.mode_info_stride + col;
+ pbi->common.mi[i].mbmi.segment_id =
+ pbi->common.prev_mi[i].mbmi.segment_id;
+ }
+ }
+ }
+#endif
+
+ if (cm->show_frame)
+ cm->current_video_frame++;
+
+ pbi->ready_for_new_data = 0;
+ pbi->last_time_stamp = time_stamp;
+
+decode_exit:
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(dx_store_reg);
+ }
+#endif
+
+ pbi->common.error.setjmp = 0;
+ pbi->num_fragments = 0;
+ return retcode;
+}
+int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
+{
+ int ret = -1;
+
+ if (pbi->ready_for_new_data == 1)
+ return ret;
+
+    /* i.e. no raw frame to show */
+ if (pbi->common.show_frame == 0)
+ return ret;
+
+ pbi->ready_for_new_data = 1;
+ *time_stamp = pbi->last_time_stamp;
+ *time_end_stamp = 0;
+
+ sd->clrtype = pbi->common.clr_type;
+#if CONFIG_POSTPROC
+ ret = vp8_post_proc_frame(&pbi->common, sd, flags);
+#else
+
+ if (pbi->common.frame_to_show)
+ {
+ *sd = *pbi->common.frame_to_show;
+ sd->y_width = pbi->common.Width;
+ sd->y_height = pbi->common.Height;
+ sd->uv_height = pbi->common.Height / 2;
+ ret = 0;
+ }
+ else
+ {
+ ret = -1;
+ }
+
+#endif /*!CONFIG_POSTPROC*/
+ vp8_clear_system_state();
+ return ret;
+}
+
+
+/* This function as written isn't decoder specific, but the encoder has
+ * much faster ways of computing it, so it is acceptable for it to live
+ * in a decoder-specific file.
+ */
+int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame)
+{
+    const MODE_INFO *mi = oci->mi;
+    int mb_row, mb_col;
+
+    for (mb_row = 0; mb_row < oci->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < oci->mb_cols; mb_col++, mi++)
+        {
+            if (mi->mbmi.ref_frame == ref_frame)
+ return 1;
+ }
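+        /* each mode info row carries one extra border entry; skip it */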
+ mi++;
+ }
+ return 0;
+}
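
The setjmp() block in vp8dx_receive_compressed_data above is the decoder's
only recovery path. Once error.setjmp is armed, any vpx_internal_error()
raised inside vp8_decode_frame() longjmp()s back to it, the half-decoded
frame buffer's reference count is released, and the last reference frame is
flagged corrupt. A minimal, self-contained sketch of the same pattern
(decode_one_frame and the fail flag are hypothetical stand-ins for the real
callees):

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf decode_jmp;

    /* stand-in for vp8_decode_frame(); vpx_internal_error() effectively
     * performs this longjmp when error.setjmp is armed */
    static int decode_one_frame(int fail)
    {
        if (fail)
            longjmp(decode_jmp, 1);
        return 0;
    }

    static int receive_frame(int fail)
    {
        if (setjmp(decode_jmp))
        {
            /* re-entered via longjmp: here the decoder releases the
             * half-decoded buffer and marks the last reference corrupt */
            return -1;
        }
        return decode_one_frame(fail);
    }

    int main(void)
    {
        printf("good frame -> %d, bad frame -> %d\n",
               receive_frame(0), receive_frame(1));
        return 0;
    }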
diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h
new file mode 100644
index 0000000..0063beb
--- /dev/null
+++ b/libvpx/vp8/decoder/onyxd_int.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8D_INT_H
+#define __INC_VP8D_INT_H
+#include "vpx_config.h"
+#include "vp8/common/onyxd.h"
+#include "treereader.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/threading.h"
+
+#if CONFIG_ERROR_CONCEALMENT
+#include "ec_types.h"
+#endif
+
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} DECODETHREAD_DATA;
+
+typedef struct
+{
+ MACROBLOCKD mbd;
+} MB_ROW_DEC;
+
+typedef struct VP8D_COMP
+{
+ DECLARE_ALIGNED(16, MACROBLOCKD, mb);
+
+ YV12_BUFFER_CONFIG *dec_fb_ref[NUM_YV12_BUFFERS];
+
+ DECLARE_ALIGNED(16, VP8_COMMON, common);
+
+ /* the last partition will be used for the modes/mvs */
+ vp8_reader mbc[MAX_PARTITIONS];
+
+ VP8D_CONFIG oxcf;
+
+
+ const unsigned char *fragments[MAX_PARTITIONS];
+ unsigned int fragment_sizes[MAX_PARTITIONS];
+ unsigned int num_fragments;
+
+#if CONFIG_MULTITHREAD
+    /* variables for threading */
+
+ volatile int b_multithreaded_rd;
+ int max_threads;
+ int current_mb_col_main;
+ unsigned int decoding_thread_count;
+ int allocated_decoding_thread_count;
+
+ int mt_baseline_filter_level[MAX_MB_SEGMENTS];
+ int sync_range;
+    int *mt_current_mb_col; /* Each row remembers its last fully decoded column. */
+
+ unsigned char **mt_yabove_row; /* mb_rows x width */
+ unsigned char **mt_uabove_row;
+ unsigned char **mt_vabove_row;
+ unsigned char **mt_yleft_col; /* mb_rows x 16 */
+ unsigned char **mt_uleft_col; /* mb_rows x 8 */
+ unsigned char **mt_vleft_col; /* mb_rows x 8 */
+
+ MB_ROW_DEC *mb_row_di;
+ DECODETHREAD_DATA *de_thread_data;
+
+ pthread_t *h_decoding_thread;
+ sem_t *h_event_start_decoding;
+ sem_t h_event_end_decoding;
+ /* end of threading data */
+#endif
+
+ int64_t last_time_stamp;
+ int ready_for_new_data;
+
+ vp8_prob prob_intra;
+ vp8_prob prob_last;
+ vp8_prob prob_gf;
+ vp8_prob prob_skip_false;
+
+#if CONFIG_ERROR_CONCEALMENT
+ MB_OVERLAP *overlaps;
+ /* the mb num from which modes and mvs (first partition) are corrupt */
+ unsigned int mvs_corrupt_from_mb;
+#endif
+ int ec_enabled;
+ int ec_active;
+ int input_fragments;
+ int decoded_key_frame;
+ int independent_partitions;
+ int frame_corrupt_residual;
+
+} VP8D_COMP;
+
+int vp8_decode_frame(VP8D_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
+
+#endif
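
CHECK_MEM_ERROR above turns a failed allocation into a vpx_internal_error()
call, which longjmp()s to whatever handler armed pbi->common.error; note that
the macro hard-codes pbi, so it is only usable where a VP8D_COMP *pbi is in
scope. For reference, a release-build use such as
CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows) from threading.c later in
this change expands to roughly:

    /* approximate release-build expansion; the debug build appends
     * __FILE__ and __LINE__ to the message */
    do {
        pbi->mt_current_mb_col =
            (vpx_calloc(sizeof(*(pbi->mt_current_mb_col)), (pc->mb_rows)));
        if (!pbi->mt_current_mb_col)
            vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,
                               "Failed to allocate pbi->mt_current_mb_col");
    } while (0);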
diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c
new file mode 100644
index 0000000..88c06be
--- /dev/null
+++ b/libvpx/vp8/decoder/threading.c
@@ -0,0 +1,918 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
+# include <unistd.h>
+#endif
+#include "onyxd_int.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/threading.h"
+
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/extend.h"
+#include "vpx_ports/vpx_timer.h"
+#include "detokenize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/setupintrarecon.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "error_concealment.h"
+#endif
+
+#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n)))
+#define CALLOC_ARRAY_ALIGNED(p, n, algn) do { \
+ CHECK_MEM_ERROR((p), vpx_memalign((algn), sizeof(*(p)) * (n))); \
+ memset((p), 0, (n) * sizeof(*(p))); \
+} while (0)
+
+
+extern void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+{
+    VP8_COMMON *const pc = &pbi->common;
+ int i;
+
+ for (i = 0; i < count; i++)
+ {
+ MACROBLOCKD *mbd = &mbrd[i].mbd;
+ mbd->subpixel_predict = xd->subpixel_predict;
+ mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+
+ mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1);
+ mbd->mode_info_stride = pc->mode_info_stride;
+
+ mbd->frame_type = pc->frame_type;
+ mbd->pre = xd->pre;
+ mbd->dst = xd->dst;
+
+ mbd->segmentation_enabled = xd->segmentation_enabled;
+ mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+ vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+ /*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
+ vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+ /*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
+ vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+ /*unsigned char mode_ref_lf_delta_enabled;
+ unsigned char mode_ref_lf_delta_update;*/
+ mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
+ mbd->mode_ref_lf_delta_update = xd->mode_ref_lf_delta_update;
+
+ mbd->current_bc = &pbi->mbc[0];
+
+ vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+ mbd->fullpixel_mask = 0xffffffff;
+
+ if (pc->full_pixel)
+ mbd->fullpixel_mask = 0xfffffff8;
+
+ }
+
+ for (i = 0; i < pc->mb_rows; i++)
+ pbi->mt_current_mb_col[i] = -1;
+}
+
+static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ unsigned int mb_idx)
+{
+ MB_PREDICTION_MODE mode;
+ int i;
+#if CONFIG_ERROR_CONCEALMENT
+ int corruption_detected = 0;
+#endif
+
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ vp8_reset_mb_tokens_context(xd);
+ }
+ else if (!vp8dx_bool_error(xd->current_bc))
+ {
+ int eobtotal;
+ eobtotal = vp8_decode_mb_tokens(pbi, xd);
+
+ /* Special case: Force the loopfilter to skip when eobtotal is zero */
+ xd->mode_info_context->mbmi.mb_skip_coeff = (eobtotal==0);
+ }
+
+ mode = xd->mode_info_context->mbmi.mode;
+
+ if (xd->segmentation_enabled)
+ vp8_mb_init_dequantizer(pbi, xd);
+
+
+#if CONFIG_ERROR_CONCEALMENT
+
+ if(pbi->ec_active)
+ {
+ int throw_residual;
+ /* When we have independent partitions we can apply residual even
+ * though other partitions within the frame are corrupt.
+ */
+ throw_residual = (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual);
+ throw_residual = (throw_residual || vp8dx_bool_error(xd->current_bc));
+
+ if ((mb_idx >= pbi->mvs_corrupt_from_mb || throw_residual))
+ {
+ /* MB with corrupt residuals or corrupt mode/motion vectors.
+ * Better to use the predictor as reconstruction.
+ */
+ pbi->frame_corrupt_residual = 1;
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ vp8_conceal_corrupt_mb(xd);
+
+
+ corruption_detected = 1;
+
+ /* force idct to be skipped for B_PRED and use the
+ * prediction only for reconstruction
+             */
+ vpx_memset(xd->eobs, 0, 25);
+ }
+ }
+#endif
+
+ /* do prediction */
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->recon_above[1],
+ xd->recon_above[2],
+ xd->recon_left[1],
+ xd->recon_left[2],
+ xd->recon_left_stride[1],
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ if (mode != B_PRED)
+ {
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->recon_above[0],
+ xd->recon_left[0],
+ xd->recon_left_stride[0],
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+ }
+ else
+ {
+ short *DQC = xd->dequant_y1;
+ int dst_stride = xd->dst.y_stride;
+
+ /* clear out residual eob info */
+ if(xd->mode_info_context->mbmi.mb_skip_coeff)
+ vpx_memset(xd->eobs, 0, 25);
+
+ intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ unsigned char *dst = xd->dst.y_buffer + b->offset;
+ B_PREDICTION_MODE b_mode =
+ xd->mode_info_context->bmi[i].as_mode;
+ unsigned char *Above;
+ unsigned char *yleft;
+ int left_stride;
+ unsigned char top_left;
+
+                /* Caution: some b_modes need 8 pixels (4 above + 4 above-right). */
+ if (i < 4 && pbi->common.filter_level)
+ Above = xd->recon_above[0] + b->offset;
+ else
+ Above = dst - dst_stride;
+
+ if (i%4==0 && pbi->common.filter_level)
+ {
+ yleft = xd->recon_left[0] + i;
+ left_stride = 1;
+ }
+ else
+ {
+ yleft = dst - 1;
+ left_stride = dst_stride;
+ }
+
+ if ((i==4 || i==8 || i==12) && pbi->common.filter_level)
+ top_left = *(xd->recon_left[0] + i - 1);
+ else
+ top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, left_stride,
+ b_mode, dst, dst_stride, top_left);
+
+                if (xd->eobs[i])
+ {
+ if (xd->eobs[i] > 1)
+ {
+ vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
+ dst, dst_stride, dst, dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_mb(xd);
+ }
+
+
+#if CONFIG_ERROR_CONCEALMENT
+ if (corruption_detected)
+ {
+ return;
+ }
+#endif
+
+ if(!xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ /* dequantization and idct */
+ if (mode != B_PRED)
+ {
+ short *DQC = xd->dequant_y1;
+
+ if (mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1)
+ {
+ vp8_dequantize_b(b, xd->dequant_y2);
+
+ vp8_short_inv_walsh4x4(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ ((int *)b->qcoeff)[1] = 0;
+ ((int *)b->qcoeff)[2] = 0;
+ ((int *)b->qcoeff)[3] = 0;
+ ((int *)b->qcoeff)[4] = 0;
+ ((int *)b->qcoeff)[5] = 0;
+ ((int *)b->qcoeff)[6] = 0;
+ ((int *)b->qcoeff)[7] = 0;
+ }
+ else
+ {
+ b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
+ vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
+ xd->qcoeff);
+ ((int *)b->qcoeff)[0] = 0;
+ }
+
+ /* override the dc dequant constant in order to preserve the
+ * dc components
+ */
+ DQC = xd->dequant_y1_dc;
+ }
+
+ vp8_dequant_idct_add_y_block
+ (xd->qcoeff, DQC,
+ xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
+{
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col;
+ int mb_row;
+ VP8_COMMON *pc = &pbi->common;
+ const int nsync = pbi->sync_range;
+ const int first_row_no_sync_above = pc->mb_cols + nsync;
+ int num_part = 1 << pbi->common.multi_token_partition;
+ int last_mb_row = start_mb_row;
+
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+ YV12_BUFFER_CONFIG *yv12_fb_lst = pbi->dec_fb_ref[LAST_FRAME];
+
+ int recon_y_stride = yv12_fb_new->y_stride;
+ int recon_uv_stride = yv12_fb_new->uv_stride;
+
+ unsigned char *ref_buffer[MAX_REF_FRAMES][3];
+ unsigned char *dst_buffer[3];
+ int i;
+ int ref_fb_corrupted[MAX_REF_FRAMES];
+
+ ref_fb_corrupted[INTRA_FRAME] = 0;
+
+ for(i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ YV12_BUFFER_CONFIG *this_fb = pbi->dec_fb_ref[i];
+
+ ref_buffer[i][0] = this_fb->y_buffer;
+ ref_buffer[i][1] = this_fb->u_buffer;
+ ref_buffer[i][2] = this_fb->v_buffer;
+
+ ref_fb_corrupted[i] = this_fb->corrupted;
+ }
+
+ dst_buffer[0] = yv12_fb_new->y_buffer;
+ dst_buffer[1] = yv12_fb_new->u_buffer;
+ dst_buffer[2] = yv12_fb_new->v_buffer;
+
+ xd->up_available = (start_mb_row != 0);
+
+ for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
+ {
+ int i;
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int filter_level;
+ loop_filter_info_n *lfi_n = &pc->lf_info;
+
+ /* save last row processed by this thread */
+ last_mb_row = mb_row;
+ /* select bool coder for current partition */
+ xd->current_bc = &pbi->mbc[mb_row%num_part];
+
+ if (mb_row > 0)
+ last_row_current_mb_col = &pbi->mt_current_mb_col[mb_row -1];
+ else
+ last_row_current_mb_col = &first_row_no_sync_above;
+
+ current_mb_col = &pbi->mt_current_mb_col[mb_row];
+
+ recon_yoffset = mb_row * recon_y_stride * 16;
+ recon_uvoffset = mb_row * recon_uv_stride * 8;
+
+ /* reset contexts */
+ xd->above_context = pc->above_context;
+ vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ xd->left_available = 0;
+
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ if (pbi->common.filter_level)
+ {
+ xd->recon_above[0] = pbi->mt_yabove_row[mb_row] + 0*16 +32;
+ xd->recon_above[1] = pbi->mt_uabove_row[mb_row] + 0*8 +16;
+ xd->recon_above[2] = pbi->mt_vabove_row[mb_row] + 0*8 +16;
+
+ xd->recon_left[0] = pbi->mt_yleft_col[mb_row];
+ xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
+ xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = 1;
+ xd->recon_left_stride[1] = 1;
+ }
+ else
+ {
+ xd->recon_above[0] = dst_buffer[0] + recon_yoffset;
+ xd->recon_above[1] = dst_buffer[1] + recon_uvoffset;
+ xd->recon_above[2] = dst_buffer[2] + recon_uvoffset;
+
+ xd->recon_left[0] = xd->recon_above[0] - 1;
+ xd->recon_left[1] = xd->recon_above[1] - 1;
+ xd->recon_left[2] = xd->recon_above[2] - 1;
+
+ xd->recon_above[0] -= xd->dst.y_stride;
+ xd->recon_above[1] -= xd->dst.uv_stride;
+ xd->recon_above[2] -= xd->dst.uv_stride;
+
+ /* TODO: move to outside row loop */
+ xd->recon_left_stride[0] = xd->dst.y_stride;
+ xd->recon_left_stride[1] = xd->dst.uv_stride;
+
+ setup_intra_recon_left(xd->recon_left[0], xd->recon_left[1],
+ xd->recon_left[2], xd->dst.y_stride,
+ xd->dst.uv_stride);
+ }
+
+ for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
+ {
+ *current_mb_col = mb_col - 1;
+
+ if ((mb_col & (nsync - 1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+
+ /* Distance of MB to the various image edges.
+ * These are specified to 8th pel as they are always
+ * compared to values that are in 1/8th pel units.
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+
+ #if CONFIG_ERROR_CONCEALMENT
+ {
+ int corrupt_residual =
+ (!pbi->independent_partitions &&
+ pbi->frame_corrupt_residual) ||
+ vp8dx_bool_error(xd->current_bc);
+ if (pbi->ec_active &&
+ (xd->mode_info_context->mbmi.ref_frame ==
+ INTRA_FRAME) &&
+ corrupt_residual)
+ {
+ /* We have an intra block with corrupt
+ * coefficients, better to conceal with an inter
+ * block.
+ * Interpolate MVs from neighboring MBs
+ *
+ * Note that for the first mb with corrupt
+ * residual in a frame, we might not discover
+ * that before decoding the residual. That
+ * happens after this check, and therefore no
+ * inter concealment will be done.
+ */
+ vp8_interpolate_motion(xd,
+ mb_row, mb_col,
+ pc->mb_rows, pc->mb_cols,
+ pc->mode_info_stride);
+ }
+ }
+ #endif
+
+
+ xd->dst.y_buffer = dst_buffer[0] + recon_yoffset;
+ xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
+ xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
+
+ xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
+ xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
+ xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
+
+ /* propagate errors from reference frames */
+ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
+
+ mt_decode_macroblock(pbi, xd, 0);
+
+ xd->left_available = 1;
+
+ /* check if the boolean decoder has suffered an error */
+ xd->corrupted |= vp8dx_bool_error(xd->current_bc);
+
+ xd->recon_above[0] += 16;
+ xd->recon_above[1] += 8;
+ xd->recon_above[2] += 8;
+
+ if (!pbi->common.filter_level)
+ {
+ xd->recon_left[0] += 16;
+ xd->recon_left[1] += 8;
+ xd->recon_left[2] += 8;
+ }
+
+ if (pbi->common.filter_level)
+ {
+ int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
+ xd->mode_info_context->mbmi.mode != SPLITMV &&
+ xd->mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
+ const int seg = xd->mode_info_context->mbmi.segment_id;
+ const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+ if( mb_row != pc->mb_rows-1 )
+ {
+ /* Save decoded MB last row data for next-row decoding */
+ vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ }
+
+ /* save left_col for next MB decoding */
+ if(mb_col != pc->mb_cols-1)
+ {
+ MODE_INFO *next = xd->mode_info_context +1;
+
+ if (next->mbmi.ref_frame == INTRA_FRAME)
+ {
+ for (i = 0; i < 16; i++)
+ pbi->mt_yleft_col[mb_row][i] = xd->dst.y_buffer [i* recon_y_stride + 15];
+ for (i = 0; i < 8; i++)
+ {
+ pbi->mt_uleft_col[mb_row][i] = xd->dst.u_buffer [i* recon_uv_stride + 7];
+ pbi->mt_vleft_col[mb_row][i] = xd->dst.v_buffer [i* recon_uv_stride + 7];
+ }
+ }
+ }
+
+ /* loopfilter on this macroblock. */
+ if (filter_level)
+ {
+ if(pc->filter_type == NORMAL_LOOPFILTER)
+ {
+ loop_filter_info lfi;
+ FRAME_TYPE frame_type = pc->frame_type;
+ const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+ lfi.mblim = lfi_n->mblim[filter_level];
+ lfi.blim = lfi_n->blim[filter_level];
+ lfi.lim = lfi_n->lim[filter_level];
+ lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+ if (mb_col > 0)
+ vp8_loop_filter_mbv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bv
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_mbh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+
+ if (!skip_lf)
+ vp8_loop_filter_bh
+ (xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
+ }
+ else
+ {
+ if (mb_col > 0)
+ vp8_loop_filter_simple_mbv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bv
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+
+ /* don't apply across umv border */
+ if (mb_row > 0)
+ vp8_loop_filter_simple_mbh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
+
+ if (!skip_lf)
+ vp8_loop_filter_simple_bh
+ (xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
+ }
+ }
+
+ }
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ ++xd->mode_info_context; /* next mb */
+
+ xd->above_context++;
+ }
+
+ /* adjust to the next row of mbs */
+ if (pbi->common.filter_level)
+ {
+ if(mb_row != pc->mb_rows-1)
+ {
+ int lasty = yv12_fb_lst->y_width + VP8BORDERINPIXELS;
+ int lastuv = (yv12_fb_lst->y_width>>1) + (VP8BORDERINPIXELS>>1);
+
+ for (i = 0; i < 4; i++)
+ {
+ pbi->mt_yabove_row[mb_row +1][lasty + i] = pbi->mt_yabove_row[mb_row +1][lasty -1];
+ pbi->mt_uabove_row[mb_row +1][lastuv + i] = pbi->mt_uabove_row[mb_row +1][lastuv -1];
+ pbi->mt_vabove_row[mb_row +1][lastuv + i] = pbi->mt_vabove_row[mb_row +1][lastuv -1];
+ }
+ }
+ }
+ else
+ vp8_extend_mb_row(yv12_fb_new, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+
+ /* last MB of row is ready just after extension is done */
+ *current_mb_col = mb_col + nsync;
+
+ ++xd->mode_info_context; /* skip prediction column */
+ xd->up_available = 1;
+
+        /* skip the rows handled by the other decoding threads */
+ xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
+ }
+
+ /* signal end of frame decoding if this thread processed the last mb_row */
+ if (last_mb_row == (pc->mb_rows - 1))
+ sem_post(&pbi->h_event_end_decoding);
+
+}
+
+
+static THREAD_FUNCTION thread_decoding_proc(void *p_data)
+{
+ int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
+ VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
+ MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
+ ENTROPY_CONTEXT_PLANES mb_row_left_context;
+
+ while (1)
+ {
+ if (pbi->b_multithreaded_rd == 0)
+ break;
+
+ if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0)
+ {
+ if (pbi->b_multithreaded_rd == 0)
+ break;
+ else
+ {
+ MACROBLOCKD *xd = &mbrd->mbd;
+ xd->left_context = &mb_row_left_context;
+
+ mt_decode_mb_rows(pbi, xd, ithread+1);
+ }
+ }
+ }
+
+    return 0;
+}
+
+
+void vp8_decoder_create_threads(VP8D_COMP *pbi)
+{
+ int core_count = 0;
+ unsigned int ithread;
+
+ pbi->b_multithreaded_rd = 0;
+ pbi->allocated_decoding_thread_count = 0;
+
+ /* limit decoding threads to the max number of token partitions */
+ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads;
+
+ /* limit decoding threads to the available cores */
+ if (core_count > pbi->common.processor_core_count)
+ core_count = pbi->common.processor_core_count;
+
+ if (core_count > 1)
+ {
+ pbi->b_multithreaded_rd = 1;
+ pbi->decoding_thread_count = core_count - 1;
+
+ CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count);
+ CALLOC_ARRAY(pbi->h_event_start_decoding, pbi->decoding_thread_count);
+ CALLOC_ARRAY_ALIGNED(pbi->mb_row_di, pbi->decoding_thread_count, 32);
+ CALLOC_ARRAY(pbi->de_thread_data, pbi->decoding_thread_count);
+
+ for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
+ {
+ sem_init(&pbi->h_event_start_decoding[ithread], 0, 0);
+
+ vp8_setup_block_dptrs(&pbi->mb_row_di[ithread].mbd);
+
+ pbi->de_thread_data[ithread].ithread = ithread;
+ pbi->de_thread_data[ithread].ptr1 = (void *)pbi;
+ pbi->de_thread_data[ithread].ptr2 = (void *) &pbi->mb_row_di[ithread];
+
+ pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread]));
+ }
+
+ sem_init(&pbi->h_event_end_decoding, 0, 0);
+
+ pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
+ }
+}
+
+
+void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
+{
+ int i;
+
+ if (pbi->b_multithreaded_rd)
+ {
+        vpx_free(pbi->mt_current_mb_col);
+        pbi->mt_current_mb_col = NULL;
+
+        /* Free above_row buffers. */
+        if (pbi->mt_yabove_row)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_yabove_row[i]);
+                pbi->mt_yabove_row[i] = NULL;
+            }
+            vpx_free(pbi->mt_yabove_row);
+            pbi->mt_yabove_row = NULL;
+        }
+
+        if (pbi->mt_uabove_row)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_uabove_row[i]);
+                pbi->mt_uabove_row[i] = NULL;
+            }
+            vpx_free(pbi->mt_uabove_row);
+            pbi->mt_uabove_row = NULL;
+        }
+
+        if (pbi->mt_vabove_row)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_vabove_row[i]);
+                pbi->mt_vabove_row[i] = NULL;
+            }
+            vpx_free(pbi->mt_vabove_row);
+            pbi->mt_vabove_row = NULL;
+        }
+
+        /* Free left_col buffers. */
+        if (pbi->mt_yleft_col)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_yleft_col[i]);
+                pbi->mt_yleft_col[i] = NULL;
+            }
+            vpx_free(pbi->mt_yleft_col);
+            pbi->mt_yleft_col = NULL;
+        }
+
+        if (pbi->mt_uleft_col)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_uleft_col[i]);
+                pbi->mt_uleft_col[i] = NULL;
+            }
+            vpx_free(pbi->mt_uleft_col);
+            pbi->mt_uleft_col = NULL;
+        }
+
+        if (pbi->mt_vleft_col)
+        {
+            for (i = 0; i < mb_rows; i++)
+            {
+                vpx_free(pbi->mt_vleft_col[i]);
+                pbi->mt_vleft_col[i] = NULL;
+            }
+            vpx_free(pbi->mt_vleft_col);
+            pbi->mt_vleft_col = NULL;
+ }
+ }
+}
+
+
+void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
+{
+    VP8_COMMON *const pc = &pbi->common;
+ int i;
+ int uv_width;
+
+ if (pbi->b_multithreaded_rd)
+ {
+ vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows);
+
+ /* our internal buffers are always multiples of 16 */
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+        if (width < 640) pbi->sync_range = 1;
+        else if (width <= 1280) pbi->sync_range = 8;
+        else if (width <= 2560) pbi->sync_range = 16;
+        else pbi->sync_range = 32;
+
+        uv_width = width >> 1;
+
+ /* Allocate an int for each mb row. */
+ CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows);
+
+ /* Allocate memory for above_row buffers. */
+ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
+
+ CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+ CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
+
+ /* Allocate memory for left_col buffers. */
+ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1));
+
+ CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+
+ CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows);
+ for (i = 0; i < pc->mb_rows; i++)
+ CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
+ }
+}
+
+
+void vp8_decoder_remove_threads(VP8D_COMP *pbi)
+{
+    /* shut down the MB decoding threads */
+ if (pbi->b_multithreaded_rd)
+ {
+ int i;
+
+ pbi->b_multithreaded_rd = 0;
+
+ /* allow all threads to exit */
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ sem_post(&pbi->h_event_start_decoding[i]);
+ pthread_join(pbi->h_decoding_thread[i], NULL);
+ }
+
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ sem_destroy(&pbi->h_event_start_decoding[i]);
+ }
+
+ sem_destroy(&pbi->h_event_end_decoding);
+
+ vpx_free(pbi->h_decoding_thread);
+ pbi->h_decoding_thread = NULL;
+
+ vpx_free(pbi->h_event_start_decoding);
+ pbi->h_event_start_decoding = NULL;
+
+ vpx_free(pbi->mb_row_di);
+        pbi->mb_row_di = NULL;
+
+ vpx_free(pbi->de_thread_data);
+ pbi->de_thread_data = NULL;
+ }
+}
+
+void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ VP8_COMMON *pc = &pbi->common;
+ unsigned int i;
+ int j;
+
+ int filter_level = pc->filter_level;
+ YV12_BUFFER_CONFIG *yv12_fb_new = pbi->dec_fb_ref[INTRA_FRAME];
+
+ if (filter_level)
+ {
+ /* Set above_row buffer to 127 for decoding first MB row */
+ vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+ vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+
+ for (j=1; j<pc->mb_rows; j++)
+ {
+ vpx_memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+ vpx_memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ vpx_memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ }
+
+ /* Set left_col to 129 initially */
+ for (j=0; j<pc->mb_rows; j++)
+ {
+ vpx_memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
+ vpx_memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
+ vpx_memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
+ }
+
+ /* Initialize the loop filter for this frame. */
+ vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level);
+ }
+ else
+ vp8_setup_intra_recon_top_line(yv12_fb_new);
+
+ setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+
+ for (i = 0; i < pbi->decoding_thread_count; i++)
+ sem_post(&pbi->h_event_start_decoding[i]);
+
+ mt_decode_mb_rows(pbi, xd, 0);
+
+    sem_wait(&pbi->h_event_end_decoding);   /* posted by the thread that decodes the last mb row */
+}
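
The central synchronization in this file is mt_decode_mb_rows(): macroblock
rows are dealt out round-robin across the threads, each row publishes its
progress through pbi->mt_current_mb_col[], and a row spins until the row
above is at least sync_range columns ahead, so the above-row context it reads
is already final. A stripped-down sketch of that handshake (the real spin
loop also issues x86_pause_hint() and thread_sleep(0)):

    /* 'cur' is this row's slot in mt_current_mb_col[], 'above' is the
     * previous row's slot, nsync is pbi->sync_range (a power of two) */
    static void decode_row(volatile int *cur, volatile const int *above,
                           int nsync, int mb_cols)
    {
        int mb_col;

        for (mb_col = 0; mb_col < mb_cols; mb_col++)
        {
            *cur = mb_col - 1;                /* publish: col-1 is decoded */

            if ((mb_col & (nsync - 1)) == 0)  /* check once per sync_range */
                while (mb_col > (*above - nsync))
                    ;                         /* spin: row above too close */

            /* decode macroblock (this_row, mb_col) here */
        }

        *cur = mb_col + nsync;                /* release any waiter for good */
    }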
diff --git a/libvpx/vp8/decoder/treereader.h b/libvpx/vp8/decoder/treereader.h
new file mode 100644
index 0000000..238ff85
--- /dev/null
+++ b/libvpx/vp8/decoder/treereader.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tree_reader_h
+#define tree_reader_h 1
+
+#include "vp8/common/treecoder.h"
+
+#include "dboolhuff.h"
+
+typedef BOOL_DECODER vp8_reader;
+
+#define vp8_read vp8dx_decode_bool
+#define vp8_read_literal vp8_decode_value
+#define vp8_read_bit(R) vp8_read(R, vp8_prob_half)
+
+
+/* Intent of tree data structure is to make decoding trivial. */
+
+static int vp8_treed_read(
+ vp8_reader *const r, /* !!! must return a 0 or 1 !!! */
+ vp8_tree t,
+ const vp8_prob *const p
+)
+{
+ register vp8_tree_index i = 0;
+
+    while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0);
+
+ return -i;
+}
+
+#endif /* tree_reader_h */
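
vp8_treed_read() above walks the coding tree one boolean at a time: a
positive table entry indexes the next node pair, a non-positive entry is a
negated leaf symbol, and an internal node at index i draws its bit with
probability p[i >> 1]. A toy, self-contained version with a canned bit
sequence standing in for the boolean decoder (the 3-symbol tree and symbols
A/B/C are hypothetical):

    #include <stdio.h>

    static int treed_read(const signed char *t, const int *bits)
    {
        int i = 0, n = 0;
        while ((i = t[i + bits[n++]]) > 0)  /* follow one branch per bit */
            ;
        return -i;                          /* leaves are stored negated */
    }

    int main(void)
    {
        enum { A, B, C };                              /* hypothetical */
        static const signed char tree[4] = { -A, 2,    /* node 0: A | ->2 */
                                             -B, -C }; /* node 2: B | C   */
        static const int bits[2] = { 1, 0 };           /* decoded bits    */

        /* bits "1,0": i = tree[1] = 2, then i = tree[2] = -B, so B */
        printf("decoded symbol: %d (B == %d)\n", treed_read(tree, bits), B);
        return 0;
    }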
diff --git a/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
new file mode 100644
index 0000000..a644a00
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -0,0 +1,310 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_start_encode|
+ EXPORT |vp8_encode_bool|
+ EXPORT |vp8_stop_encode|
+ EXPORT |vp8_encode_value|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+; r2 unsigned char *source_end
+|vp8_start_encode| PROC
+ str r2, [r0, #vp8_writer_buffer_end]
+ mov r12, #0
+ mov r3, #255
+ mvn r2, #23
+ str r12, [r0, #vp8_writer_lowvalue]
+ str r3, [r0, #vp8_writer_range]
+ str r2, [r0, #vp8_writer_count]
+ str r12, [r0, #vp8_writer_pos]
+ str r1, [r0, #vp8_writer_buffer]
+ bx lr
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
+|vp8_encode_bool| PROC
+ push {r4-r10, lr}
+
+ mov r4, r2
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ sub r7, r5, #1 ; range-1
+
+ cmp r1, #0
+ mul r6, r4, r7 ; ((range-1) * probability)
+
+ mov r7, #1
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
+
+ addne r2, r2, r4 ; if (bit) lowvalue += split
+ subne r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r1 ; validate_buffer at pos
+
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r10, pc}
+ ENDP
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+ push {r4-r10, lr}
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ mov r10, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r1, [r7, r4]
+ cmpge r1, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r1, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r1, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r1 ; validate_buffer at pos
+
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne stop_encode_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r10, pc}
+
+ ENDP
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+ push {r4-r12, lr}
+
+ mov r10, r2
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+ rsb r4, r10, #32 ; 32-n
+
+ ; v is kept in r1 during the token pack loop
+ lsl r1, r1, r4 ; r1 = v << 32 - n
+
+encode_value_loop
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r1, r1, #1 ; bit = v >> n
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ addcs r2, r2, r4 ; if (bit) lowvalue += split
+ subcs r4, r5, r4 ; if (bit) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_ev ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_ev
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_ev
+token_zero_while_loop_ev
+ mov r9, #0
+ strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_ev
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_ev
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r9, [r7, r4] ; w->buffer[x]
+ add r9, r9, #1
+ strb r9, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+ rsb r4, r6, #24 ; 24-offset
+ ldr r9, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r9, r11 ; validate_buffer at pos
+
+ strb r7, [r9, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r10, r10, #1
+ bne encode_value_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ pop {r4-r12, pc}
+ ENDP
+
+ END
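
The three routines above are hand-scheduled variants of one bool-encoder
step: derive the split point from range and the probability, move
lowvalue/range into the chosen sub-interval, then renormalize by the
clz-derived shift. The arithmetic the register comments describe looks
roughly like this in C (renormalization written as a loop; the byte-flush
and carry path that runs when count crosses zero is elided):

    typedef struct { unsigned int lowvalue, range; int count; } writer;

    static void encode_bool(writer *w, int bit, int probability)
    {
        /* split = 1 + (((range-1) * probability) >> 8), as in the asm */
        unsigned int split = 1 + (((w->range - 1) * probability) >> 8);
        int shift = 0;

        if (bit)
        {
            w->lowvalue += split;     /* take the upper sub-interval */
            w->range    -= split;
        }
        else
            w->range = split;         /* take the lower sub-interval */

        while (w->range < 128)        /* same effect as clz(range) - 24 */
        {
            w->range <<= 1;
            shift++;
        }

        w->count    += shift;         /* the asm flushes a byte when  */
        w->lowvalue <<= shift;        /* count becomes non-negative   */
    }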
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
new file mode 100644
index 0000000..a1cd467
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
@@ -0,0 +1,317 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_armv5|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+
+; r0 vp8_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
+|vp8cx_pack_tokens_armv5| PROC
+ push {r4-r12, lr}
+ sub sp, sp, #16
+
+    ; Add xcount * sizeof (TOKENEXTRA) to get stop
+ ; sizeof (TOKENEXTRA) is 8
+ add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
+ str r2, [sp, #0]
+ str r3, [sp, #8] ; save vp8_coef_encodings
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+ b check_p_lt_stop
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #8] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; a temp variable here. So after r10 is used, reload
+    ; vp8_coef_tree into r10
+ ldr r10, [sp, #60] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #56] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp8_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ add sp, sp, #16
+ pop {r4-r12, pc}
+ ENDP
+
+ END
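
Each token_zero_while loop above is the carry step of the arithmetic coder:
when adding split overflows bit 31 of lowvalue, the carry must ripple back
through bytes already written, turning trailing 0xff bytes into 0x00 and
incrementing the first earlier byte. As a C sketch (buffer and pos play the
roles of w->buffer and w->pos):

    /* ripple a carry out of lowvalue into already-written output bytes */
    static void propagate_carry(unsigned char *buffer, int pos)
    {
        int x = pos - 1;

        while (x >= 0 && buffer[x] == 0xff)
            buffer[x--] = 0;      /* 0xff + carry -> 0x00, keep rippling */

        if (x >= 0)
            buffer[x] += 1;       /* first non-0xff byte absorbs the carry */
    }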
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
new file mode 100644
index 0000000..1fa5e6c
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
@@ -0,0 +1,352 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+; r0 VP8_COMP *cpi
+; r1 vp8_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
+|vp8cx_pack_mb_row_tokens_armv5| PROC
+ push {r4-r12, lr}
+ sub sp, sp, #24
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r2, [sp, #20] ; save vp8_coef_encodings
+ str r5, [sp, #12] ; save mb_rows
+ str r3, [sp, #8] ; save vp8_extra_bits
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+    ldr r7, [r4, #0] ; dereference cpi->tplist
+
+ mov r0, r1 ; keep same as other loops
+
+ ldr r2, [r0, #vp8_writer_lowvalue]
+ ldr r5, [r0, #vp8_writer_range]
+ ldr r3, [r0, #vp8_writer_count]
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #20] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #64] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 is used earlier in the loop, but r10 is used as
+    ; a temp variable here. So after r10 is used, reload
+    ; vp8_coef_tree into r10
+ ldr r10, [sp, #64] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #8] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element. Here sizeof (vp8_extra_bit_struct) is 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset= shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, #1
+ add r7, r7, #TOKENLIST_SZ ; next element in the array
+ str r6, [sp, #12]
+ bne mb_row_loop
+
+ str r2, [r0, #vp8_writer_lowvalue]
+ str r5, [r0, #vp8_writer_range]
+ str r3, [r0, #vp8_writer_count]
+ add sp, sp, #24
+ pop {r4-r12, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+
+ END
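
The mb_row_loop above is the assembly form of a simple walk over
cpi->tplist: each row's TOKENEXTRA run [start, stop) is packed with the same
inner token loop as vp8cx_pack_tokens_armv5. Its C shape, with hypothetical
stand-in types and pack_tokens representing that shared inner loop:

    typedef struct { int token; } TOKENEXTRA;                 /* stand-in */
    typedef struct { const TOKENEXTRA *start, *stop; } TOKENLIST;
    typedef struct { TOKENLIST *tplist; int mb_rows; } encoder;
    typedef struct { int pos; } vp8_writer;                   /* stand-in */

    static void pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int n)
    {
        (void)w; (void)p; (void)n;    /* per-token inner loop elided */
    }

    static void pack_mb_row_tokens(encoder *cpi, vp8_writer *w)
    {
        int mb_row;

        for (mb_row = 0; mb_row < cpi->mb_rows; mb_row++)
        {
            const TOKENEXTRA *start = cpi->tplist[mb_row].start;
            const TOKENEXTRA *stop  = cpi->tplist[mb_row].stop;

            pack_tokens(w, start, (int)(stop - start));
        }
    }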
diff --git a/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
new file mode 100644
index 0000000..90a98fe
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -0,0 +1,471 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+ IMPORT |vp8_validate_buffer_arm|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+
+ ; macro for validating write buffer position
+ ; needs vp8_writer in r0
+ ; start shall not be in r1
+ MACRO
+ VALIDATE_POS $start, $pos
+ push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
+ ldr r2, [r0, #vp8_writer_buffer_end]
+ ldr r3, [r0, #vp8_writer_error]
+ mov r1, $pos
+ mov r0, $start
+ bl vp8_validate_buffer_arm
+ pop {r0-r3, r12, lr}
+ MEND
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 const unsigned char *cx_data_end
+; r3 int num_part
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp8_tree_index *
+
+|vp8cx_pack_tokens_into_partitions_armv5| PROC
+ push {r4-r12, lr}
+ sub sp, sp, #40
+
+ ; Compute address of cpi->common.mb_rows
+ ldr r4, _VP8_COMP_common_
+ ldr r6, _VP8_COMMON_MBrows_
+ add r4, r0, r4
+
+ ldr r5, [r4, r6] ; load up mb_rows
+
+ str r5, [sp, #36] ; save mb_rows
+ str r1, [sp, #24] ; save ptr = cx_data
+ str r3, [sp, #20] ; save num_part
+ str r2, [sp, #8] ; save cx_data_end
+
+ ldr r4, _VP8_COMP_tplist_
+ add r4, r0, r4
+ ldr r7, [r4, #0] ; dereference cpi->tp_list
+ str r7, [sp, #32] ; store start of cpi->tp_list
+
+ ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi
+ add r0, r0, r11
+
+ mov r11, #0
+ str r11, [sp, #28] ; i
+
+numparts_loop
+ ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer)
+ add r0, r2 ; bc[i + 1]
+
+ ldr r10, [sp, #24] ; ptr
+    ldr     r5, [sp, #36]           ; mb_rows
+    subs    r5, r5, r11             ; partition i starts at mb row i,
+                                    ; so mb_rows - i rows remain for it
+ str r5, [sp, #12]
+
+ ; Reset all of the VP8 Writer data for each partition that
+ ; is processed.
+ ; start_encode
+
+ ldr r3, [sp, #8]
+ str r3, [r0, #vp8_writer_buffer_end]
+
+ mov r2, #0 ; vp8_writer_lowvalue
+ mov r5, #255 ; vp8_writer_range
+ mvn r3, #23 ; vp8_writer_count
+
+ str r2, [r0, #vp8_writer_pos]
+ str r10, [r0, #vp8_writer_buffer]
+
+ ble end_partition ; if (mb_rows <= 0) end partition
+
+mb_row_loop
+
+ ldr r1, [r7, #tokenlist_start]
+ ldr r9, [r7, #tokenlist_stop]
+ str r9, [sp, #0] ; save stop for later comparison
+ str r7, [sp, #16] ; tokenlist address for next time
+
+ b check_p_lt_stop
+
+ ; actual work gets done here!
+
+while_p_lt_stop
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r4, [sp, #80] ; vp8_coef_encodings
+ mov lr, #0
+ add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
+ ldr r9, [r1, #tokenextra_context_tree] ; pp
+
+ ldrb r7, [r1, #tokenextra_skip_eob_node]
+
+ ldr r6, [r4, #vp8_token_value] ; v
+ ldr r8, [r4, #vp8_token_len] ; n
+
+ ; vp8 specific skip_eob_node
+ cmp r7, #0
+ movne lr, #2 ; i = 2
+ subne r8, r8, #1 ; --n
+
+ rsb r4, r8, #32 ; 32-n
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+ ; v is kept in r12 during the token pack loop
+ lsl r12, r6, r4 ; r12 = v << 32 - n
+
+; loop start
+token_loop
+ ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
+ sub r7, r5, #1 ; range-1
+
+ ; Decisions are made based on the bit value shifted
+ ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+ lsls r12, r12, #1 ; bb = v >> n
+ mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
+
+ ; bb can only be 0 or 1. So only execute this statement
+ ; if bb == 1, otherwise it will act like i + 0
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
+ add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start
+token_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+    ; r10 normally holds vp8_coef_tree in this loop, but it was
+    ; used as a temp variable above, so reload vp8_coef_tree
+    ; into r10 here
+ ldr r10, [sp, #88] ; vp8_coef_tree
+
+token_count_lt_zero
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r8, r8, #1 ; --n
+ bne token_loop
+
+ ldrb r6, [r1, #tokenextra_token] ; t
+ ldr r7, [sp, #84] ; vp8_extra_bits
+ ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ; element. Here sizeof(vp8_extra_bit_struct) == 16
+ add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
+
+ ldr r4, [r12, #vp8_extra_bit_struct_base_val]
+ cmp r4, #0
+ beq skip_extra_bits
+
+; if( b->base_val)
+ ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
+ ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
+ cmp r8, #0 ; if( L)
+ beq no_extra_bits
+
+ ldr r9, [r12, #vp8_extra_bit_struct_prob]
+ asr r7, lr, #1 ; v=e>>1
+
+ ldr r10, [r12, #vp8_extra_bit_struct_tree]
+ str r10, [sp, #4] ; b->tree
+
+ rsb r4, r8, #32
+ lsl r12, r7, r4
+
+ mov lr, #0 ; i = 0
+
+extra_bits_loop
+ ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
+ sub r7, r5, #1 ; range-1
+ lsls r12, r12, #1 ; v >> n
+ mul r6, r4, r7 ; (range-1) * pp[i>>1]
+ addcs lr, lr, #1 ; i + bb
+
+ mov r7, #1
+ ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
+ add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
+
+ addcs r2, r2, r4 ; if (bb) lowvalue += split
+ subcs r4, r5, r4 ; if (bb) range = range-split
+
+ clz r6, r4
+ sub r6, r6, #24
+
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi extra_count_lt_zero ; if(count >= 0)
+
+    sub     r6, r6, r3              ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl extra_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos - 1
+ b extra_zero_while_start
+extra_zero_while_loop
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+extra_zero_while_start
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq extra_zero_while_loop
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4]
+ add r10, r10, #1
+ strb r10, [r7, r4]
+extra_high_bit_not_set
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos]
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+ ldr r10, [sp, #4] ; b->tree
+extra_count_lt_zero
+ lsl r2, r2, r6
+
+ subs r8, r8, #1 ; --n
+ bne extra_bits_loop ; while (n)
+
+no_extra_bits
+ ldr lr, [r1, #4] ; e = p->Extra
+ add r4, r5, #1 ; range + 1
+ tst lr, #1
+ lsr r4, r4, #1 ; split = (range + 1) >> 1
+ addne r2, r2, r4 ; lowvalue += split
+ subne r4, r5, r4 ; range = range-split
+ tst r2, #0x80000000 ; lowvalue & 0x80000000
+ lsl r5, r4, #1 ; range <<= 1
+ beq end_high_bit_not_set
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mov r7, #0
+ sub r4, r4, #1
+ b end_zero_while_start
+end_zero_while_loop
+ strb r7, [r6, r4]
+ sub r4, r4, #1 ; x--
+end_zero_while_start
+ cmp r4, #0
+ ldrge r6, [r0, #vp8_writer_buffer]
+ ldrb r12, [r6, r4]
+ cmpge r12, #0xff
+ beq end_zero_while_loop
+
+ ldr r6, [r0, #vp8_writer_buffer]
+ ldrb r7, [r6, r4]
+ add r7, r7, #1
+ strb r7, [r6, r4]
+end_high_bit_not_set
+ adds r3, r3, #1 ; ++count
+ lsl r2, r2, #1 ; lowvalue <<= 1
+ bne end_count_zero
+
+ ldr r4, [r0, #vp8_writer_pos]
+ mvn r3, #7 ; count = -8
+ ldr r7, [r0, #vp8_writer_buffer]
+ lsr r6, r2, #24 ; lowvalue >> 24
+ add r12, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r12, [r0, #vp8_writer_pos]
+
+ VALIDATE_POS r7, r12 ; validate_buffer at pos
+
+ strb r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+ add r1, r1, #TOKENEXTRA_SZ ; ++p
+check_p_lt_stop
+ ldr r4, [sp, #0] ; stop
+ cmp r1, r4 ; while( p < stop)
+ bcc while_p_lt_stop
+
+    ldr     r10, [sp, #20]          ; num_part
+ mov r1, #TOKENLIST_SZ
+ mul r1, r10, r1
+
+ ldr r6, [sp, #12] ; mb_rows
+ ldr r7, [sp, #16] ; tokenlist address
+ subs r6, r6, r10
+ add r7, r7, r1 ; next element in the array
+ str r6, [sp, #12]
+ bgt mb_row_loop
+
+end_partition
+ mov r12, #32
+
+stop_encode_loop
+ sub r7, r5, #1 ; range-1
+
+ mov r4, r7, lsl #7 ; ((range-1) * 128)
+
+ mov r7, #1
+ add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
+
+ ; Counting the leading zeros is used to normalize range.
+ clz r6, r4
+ sub r6, r6, #24 ; shift
+
+ ; Flag is set on the sum of count. This flag is used later
+ ; to determine if count >= 0
+ adds r3, r3, r6 ; count += shift
+ lsl r5, r4, r6 ; range <<= shift
+ bmi token_count_lt_zero_se ; if(count >= 0)
+
+ sub r6, r6, r3 ; offset = shift - count
+ sub r4, r6, #1 ; offset-1
+ lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
+ bpl token_high_bit_not_set_se
+
+ ldr r4, [r0, #vp8_writer_pos] ; x
+ sub r4, r4, #1 ; x = w->pos-1
+ b token_zero_while_start_se
+token_zero_while_loop_se
+ mov r10, #0
+ strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
+ sub r4, r4, #1 ; x--
+token_zero_while_start_se
+ cmp r4, #0
+ ldrge r7, [r0, #vp8_writer_buffer]
+ ldrb r11, [r7, r4]
+ cmpge r11, #0xff
+ beq token_zero_while_loop_se
+
+ ldr r7, [r0, #vp8_writer_buffer]
+ ldrb r10, [r7, r4] ; w->buffer[x]
+ add r10, r10, #1
+ strb r10, [r7, r4] ; w->buffer[x] + 1
+token_high_bit_not_set_se
+ rsb r4, r6, #24 ; 24-offset
+ ldr r10, [r0, #vp8_writer_buffer]
+ lsr r7, r2, r4 ; lowvalue >> (24-offset)
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ lsl r2, r2, r6 ; lowvalue <<= offset
+ mov r6, r3 ; shift = count
+ add r11, r4, #1 ; w->pos++
+ bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
+ str r11, [r0, #vp8_writer_pos]
+ sub r3, r3, #8 ; count -= 8
+
+ VALIDATE_POS r10, r11 ; validate_buffer at pos
+
+ strb r7, [r10, r4] ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+ lsl r2, r2, r6 ; lowvalue <<= shift
+
+ subs r12, r12, #1
+ bne stop_encode_loop
+
+ ldr r4, [r0, #vp8_writer_pos] ; w->pos
+ ldr r12, [sp, #24] ; ptr
+ add r12, r12, r4 ; ptr += w->pos
+ str r12, [sp, #24]
+
+ ldr r11, [sp, #28] ; i
+    ldr     r10, [sp, #20]          ; num_part
+
+ add r11, r11, #1 ; i++
+ str r11, [sp, #28]
+
+ ldr r7, [sp, #32] ; cpi->tp_list[i]
+ mov r1, #TOKENLIST_SZ
+ add r7, r7, r1 ; next element in cpi->tp_list
+ str r7, [sp, #32] ; cpi->tp_list[i+1]
+
+ cmp r10, r11
+ bgt numparts_loop
+
+ add sp, sp, #40
+ pop {r4-r12, pc}
+ ENDP
+
+_VP8_COMP_common_
+ DCD vp8_comp_common
+_VP8_COMMON_MBrows_
+ DCD vp8_common_mb_rows
+_VP8_COMP_tplist_
+ DCD vp8_comp_tplist
+_VP8_COMP_bc_
+ DCD vp8_comp_bc
+_vp8_writer_sz_
+ DCD vp8_writer_sz
+
+ END
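The end_partition / stop_encode_loop block above is the assembly form of flushing the bool coder: it encodes 32 bits at probability 128, which carries no information but forces every remaining bit of lowvalue out through the normal renormalization path. Expressed in C (mirroring vp8_stop_encode() in vp8/encoder/boolhuff.c, and reusing the hypothetical encode_bool_sketch() helper sketched earlier):

static void stop_encode_sketch(vp8_writer *w)
{
    int i;

    for (i = 0; i < 32; i++)
        encode_bool_sketch(w, 0, 128);   /* probability 1/2 */
}

After the flush, the routine advances ptr by w->pos and steps to the next entry of cpi->tp_list, so the partitions are packed back-to-back into cx_data.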
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
new file mode 100644
index 0000000..d61f5d9
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
@@ -0,0 +1,225 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *b
+; r1 BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+ stmfd sp!, {r1, r4-r11, lr}
+
+ ldr r3, [r0, #vp8_block_coeff] ; coeff
+ ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
+ ldr r5, [r0, #vp8_block_round] ; round
+ ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
+ ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
+ ldr r8, [r1, #vp8_blockd_dequant] ; dequant
+
+ ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
+ ; is used to update the counter so that
+ ; it can be used to mark nonzero
+ ; quantized coefficient pairs.
+
+ mov r1, #0 ; flags for quantized coeffs
+
+ ; PART 1: quantization and dequantization loop
+loop
+ ldr r9, [r3], #4 ; [z1 | z0]
+ ldr r10, [r5], #4 ; [r1 | r0]
+ ldr r11, [r4], #4 ; [q1 | q0]
+
+ ssat16 lr, #1, r9 ; [sz1 | sz0]
+ eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
+ ssub16 r9, r9, lr ; x = (z ^ sz) - sz
+ sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
+
+ ldr r12, [r3], #4 ; [z3 | z2]
+
+ smulbb r0, r9, r11 ; [(x0+r0)*q0]
+ smultt r9, r9, r11 ; [(x1+r1)*q1]
+
+ ldr r10, [r5], #4 ; [r3 | r2]
+
+ ssat16 r11, #1, r12 ; [sz3 | sz2]
+ eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
+ pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
+ ldr r9, [r4], #4 ; [q3 | q2]
+ ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
+
+ sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
+
+ eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+ smulbb r10, r12, r9 ; [(x2+r2)*q2]
+ smultt r12, r12, r9 ; [(x3+r3)*q3]
+
+ ssub16 r0, r0, lr ; x = (y ^ sz) - sz
+
+ cmp r0, #0 ; check if zero
+ orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
+
+ str r0, [r6], #4 ; *qcoeff++ = x
+ ldr r9, [r8], #4 ; [dq1 | dq0]
+
+ pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
+ eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
+ ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
+
+ cmp r10, #0 ; check if zero
+ orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
+
+ str r10, [r6], #4 ; *qcoeff++ = x
+ ldr r11, [r8], #4 ; [dq3 | dq2]
+
+ smulbb r12, r0, r9 ; [x0*dq0]
+ smultt r0, r0, r9 ; [x1*dq1]
+
+ smulbb r9, r10, r11 ; [x2*dq2]
+ smultt r10, r10, r11 ; [x3*dq3]
+
+ lsls r2, r2, #2 ; update loop counter
+ strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
+ strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
+ strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
+ strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
+ add r7, r7, #8 ; dqcoeff += 8
+ bne loop
+
+ ; PART 2: check position for eob...
+ ldr r11, [sp, #0] ; restore BLOCKD pointer
+ mov lr, #0 ; init eob
+ cmp r1, #0 ; coeffs after quantization?
+ ldr r12, [r11, #vp8_blockd_eob]
+ beq end ; skip eob calculations if all zero
+
+ ldr r0, [r11, #vp8_blockd_qcoeff]
+
+ ; check shortcut for nonzero qcoeffs
+ tst r1, #0x80
+ bne quant_coeff_15_14
+ tst r1, #0x20
+ bne quant_coeff_13_11
+ tst r1, #0x8
+ bne quant_coeff_12_7
+ tst r1, #0x40
+ bne quant_coeff_10_9
+ tst r1, #0x10
+ bne quant_coeff_8_3
+ tst r1, #0x2
+ bne quant_coeff_6_5
+ tst r1, #0x4
+ bne quant_coeff_4_2
+ b quant_coeff_1_0
+
+quant_coeff_15_14
+ ldrh r2, [r0, #30] ; rc=15, i=15
+ mov lr, #16
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #28] ; rc=14, i=14
+ mov lr, #15
+ cmp r3, #0
+ bne end
+
+quant_coeff_13_11
+ ldrh r2, [r0, #22] ; rc=11, i=13
+ mov lr, #14
+ cmp r2, #0
+ bne end
+
+quant_coeff_12_7
+ ldrh r3, [r0, #14] ; rc=7, i=12
+ mov lr, #13
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #20] ; rc=10, i=11
+ mov lr, #12
+ cmp r2, #0
+ bne end
+
+quant_coeff_10_9
+ ldrh r3, [r0, #26] ; rc=13, i=10
+ mov lr, #11
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #24] ; rc=12, i=9
+ mov lr, #10
+ cmp r2, #0
+ bne end
+
+quant_coeff_8_3
+ ldrh r3, [r0, #18] ; rc=9, i=8
+ mov lr, #9
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #12] ; rc=6, i=7
+ mov lr, #8
+ cmp r2, #0
+ bne end
+
+quant_coeff_6_5
+ ldrh r3, [r0, #6] ; rc=3, i=6
+ mov lr, #7
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #4] ; rc=2, i=5
+ mov lr, #6
+ cmp r2, #0
+ bne end
+
+quant_coeff_4_2
+ ldrh r3, [r0, #10] ; rc=5, i=4
+ mov lr, #5
+ cmp r3, #0
+ bne end
+
+ ldrh r2, [r0, #16] ; rc=8, i=3
+ mov lr, #4
+ cmp r2, #0
+ bne end
+
+ ldrh r3, [r0, #8] ; rc=4, i=2
+ mov lr, #3
+ cmp r3, #0
+ bne end
+
+quant_coeff_1_0
+ ldrh r2, [r0, #2] ; rc=1, i=1
+ mov lr, #2
+ cmp r2, #0
+ bne end
+
+ mov lr, #1 ; rc=0, i=0
+
+end
+ strb lr, [r12]
+ ldmfd sp!, {r1, r4-r11, pc}
+
+ ENDP
+
+loop_count
+ DCD 0x1000000
+
+ END
+
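Both parts of this routine have a compact scalar description: part 1 folds the sign out of each coefficient, quantizes, restores the sign, and dequantizes; part 2 recovers the end-of-block index. Below is a sketch in the shape of vp8_fast_quantize_b_c() (vp8/encoder/quantize.c). The assembly instead processes coefficients in raster order four at a time and finds eob through the branch ladder above rather than this zigzag loop, but the results agree.

void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d)
{
    int i, rc, eob = -1;
    int x, y, z, sz;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    for (i = 0; i < 16; i++)
    {
        rc = vp8_default_zig_zag1d[i];   /* scan in zigzag order */
        z  = coeff_ptr[rc];

        sz = (z >> 31);                  /* sign mask: 0 or -1 */
        x  = (z ^ sz) - sz;              /* x = abs(z) */

        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16;
        x  = (y ^ sz) - sz;              /* put the sign back */

        qcoeff_ptr[rc]  = x;
        dqcoeff_ptr[rc] = x * dequant_ptr[rc];

        if (y)
            eob = i;                     /* last nonzero in scan order */
    }

    *d->eob = (char)(eob + 1);
}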
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
new file mode 100644
index 0000000..000805d
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
@@ -0,0 +1,138 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_armv6|
+
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp8_variance16x16_armv6. In this function the sum is never
+; used, so that part of the calculation has been removed.
+
+|vp8_mse16x16_armv6| PROC
+
+ push {r4-r9, lr}
+
+ pld [r0, r1, lsl #0]
+ pld [r2, r3, lsl #0]
+
+ mov r12, #16 ; set loop counter to 16 (=block height)
+ mov r4, #0 ; initialize sse = 0
+
+loop
+ ; 1st 4 pixels
+ ldr r5, [r0, #0x0] ; load 4 src pixels
+ ldr r6, [r2, #0x0] ; load 4 ref pixels
+
+ mov lr, #0 ; constant zero
+
+ usub8 r8, r5, r6 ; calculate difference
+ pld [r0, r1, lsl #1]
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ pld [r2, r3, lsl #1]
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0x4] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 2nd 4 pixels
+ ldr r6, [r2, #0x4] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+ ldr r5, [r0, #0x8] ; load 4 src pixels
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 3rd 4 pixels
+ ldr r6, [r2, #0x8] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ ldr r5, [r0, #0xc] ; load 4 src pixels
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+
+ ; 4th 4 pixels
+ ldr r6, [r2, #0xc] ; load 4 ref pixels
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ usub8 r8, r5, r6 ; calculate difference
+ add r0, r0, r1 ; set src_ptr to next row
+ sel r7, r8, lr ; select bytes with positive difference
+ usub8 r9, r6, r5 ; calculate difference with reversed operands
+ add r2, r2, r3 ; set dst_ptr to next row
+ sel r8, r9, lr ; select bytes with negative difference
+
+ ; calculate partial sums
+ usad8 r5, r7, lr ; calculate sum of positive differences
+ usad8 r6, r8, lr ; calculate sum of negative differences
+ orr r8, r8, r7 ; differences of all 4 pixels
+
+ subs r12, r12, #1 ; next row
+
+ ; calculate sse
+ uxtb16 r6, r8 ; byte (two pixels) to halfwords
+ uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
+ smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
+ smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
+
+ bne loop
+
+ ; return stuff
+ ldr r1, [sp, #28] ; get address of sse
+ mov r0, r4 ; return sse
+ str r4, [r1] ; store sse
+
+ pop {r4-r9, pc}
+
+ ENDP
+
+ END
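A scalar sketch of the same kernel, shaped like vp8_mse16x16_c(): it is the 16x16 variance loop with the sum term dropped, so the return value and *sse are identical.

unsigned int mse16x16_sketch(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *sse)
{
    unsigned int total = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            int diff = src_ptr[c] - ref_ptr[c];
            total += (unsigned int)(diff * diff);
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = total;
    return total;
}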
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
new file mode 100644
index 0000000..8034c1d
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm
@@ -0,0 +1,262 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_fdct4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_armv6| PROC
+
+ stmfd sp!, {r4 - r12, lr}
+
+ ; PART 1
+
+ ; coeffs 0-3
+ ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
+
+ ldr r10, c7500
+ ldr r11, c14500
+ ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
+ ldr lr, c0x00080008
+ ror r5, r5, #16 ; [i2 | i3]
+
+ qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
+ smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+ smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
+
+ pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
+
+ str r6, [r1, #4]
+
+ ; coeffs 4-7
+ ror r9, r9, #16 ; [i6 | i7]
+
+ qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+ qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
+ smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+ smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
+
+ pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
+
+ str r6, [r1, #12]
+
+ ; coeffs 8-11
+ ror r5, r5, #16 ; [i10 | i11]
+
+ qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+ qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+ add r0, r0, r2 ; update input pointer
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
+ smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+ smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
+
+ pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
+ pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
+
+ str r6, [r1, #20]
+
+ ; coeffs 12-15
+ ror r5, r5, #16 ; [i14 | i15]
+
+ qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
+ qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+ qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
+ ; with 2217*4 and 5352*4 without losing the
+ ; sign bit (overflow)
+
+ smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
+ smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+ smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
+ smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
+
+ pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
+ pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
+
+ str r6, [r1, #28]
+
+
+ ; PART 2 -------------------------------------------------
+ ldr r11, c12000
+ ldr r10, c51000
+ ldr lr, c0x00070007
+
+ qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
+ qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
+ qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
+ qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+    add     r0, r11, #0x10000       ; 12000 + 65536; the extra 65536 becomes +1 after >>16, i.e. the (d1!=0) term
+
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ ldr r12, c0x08a914e8 ; [2217 | 5352]
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #0] ; [ o1 | o0]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #16] ; [ o9 | o8]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ?
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+ asrs r6, r7, #16
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ ldr r3, [r1, #4] ; [i3 | i2]
+
+ pkhtb r5, r5, r4, asr #16 ; [o13|o12]
+
+    str     r9, [r1, #8]            ; [o5 | o4]
+
+ ldr r9, [r1, #12] ; [i7 | i6]
+ ldr r8, [r1, #28] ; [i15|i14]
+ ldr r2, [r1, #20] ; [i11|i10]
+ str r5, [r1, #24] ; [o13|o12]
+
+ qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
+ qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
+
+ qadd16 r4, r4, lr ; a1 + 7
+
+ qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
+ qadd16 r2, r4, r5 ; a1 + b1 + 7
+ qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
+ qsub16 r3, r4, r5 ; a1 - b1 + 7
+
+ lsl r8, r2, #16 ; prepare bottom halfword for scaling
+ asr r2, r2, #4 ; scale top halfword
+ lsl r9, r3, #16 ; prepare bottom halfword for scaling
+ asr r3, r3, #4 ; scale top halfword
+ pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
+ pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+ smulbt r2, r6, r12 ; [ ------ | c1*2217]
+ str r4, [r1, #4] ; [ o3 | o2]
+ smultt r3, r6, r12 ; [c1*2217 | ------ ]
+ str r5, [r1, #20] ; [ o11 | o10]
+
+ smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
+ smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
+
+ smulbb r2, r6, r12 ; [ ------ | c1*5352]
+ smultb r3, r6, r12 ; [c1*5352 | ------ ]
+
+ lsls r6, r7, #16 ; d1 != 0 ?
+ addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
+ addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+ asrs r6, r7, #16
+ addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
+ addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+ smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
+ smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
+
+ pkhtb r9, r9, r8, asr #16
+
+ sub r4, r4, r2
+ sub r5, r5, r3
+
+ str r9, [r1, #12] ; [o7 | o6]
+ pkhtb r5, r5, r4, asr #16 ; [o15|o14]
+
+ str r5, [r1, #28] ; [o15|o14]
+
+ ldmfd sp!, {r4 - r12, pc}
+
+ ENDP
+
+; Used constants
+c7500
+ DCD 7500
+c14500
+ DCD 14500
+c0x22a453a0
+ DCD 0x22a453a0
+c0x00080008
+ DCD 0x00080008
+c12000
+ DCD 12000
+c51000
+ DCD 51000
+c0x00070007
+ DCD 0x00070007
+c0x08a914e8
+ DCD 0x08a914e8
+
+ END
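The constant pool above holds the fixed-point parameters of the scalar transform. For orientation, here is a sketch of the reference vp8_short_fdct4x4_c() (vp8/encoder/dct.c) that this routine mirrors, including the '+ (d1 != 0)' adjustment that the addeq/addne pair implements in part two.

void short_fdct4x4_sketch(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    /* Part one: transform the rows. */
    for (i = 0; i < 4; i++)
    {
        a1 = (ip[0] + ip[3]) * 8;
        b1 = (ip[1] + ip[2]) * 8;
        c1 = (ip[1] - ip[2]) * 8;
        d1 = (ip[0] - ip[3]) * 8;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

        ip += pitch / 2;                 /* pitch is in bytes */
        op += 4;
    }

    /* Part two: transform the columns of the part-one result. */
    ip = output;
    op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0] = (a1 + b1 + 7) >> 4;
        op[8] = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}

This also explains why the 8x4 wrapper in dct_arm.c further down can simply call the 4x4 routine twice, with the second input offset by 4 columns.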
diff --git a/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
new file mode 100644
index 0000000..f329f8f
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
@@ -0,0 +1,272 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_subtract_mby_armv6|
+ EXPORT |vp8_subtract_mbuv_armv6|
+ EXPORT |vp8_subtract_b_armv6|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0 BLOCK *be
+; r1 BLOCKD *bd
+; r2 int pitch
+|vp8_subtract_b_armv6| PROC
+
+ stmfd sp!, {r4-r9}
+
+ ldr r4, [r0, #vp8_block_base_src]
+ ldr r5, [r0, #vp8_block_src]
+ ldr r6, [r0, #vp8_block_src_diff]
+
+ ldr r3, [r4]
+ ldr r7, [r0, #vp8_block_src_stride]
+ add r3, r3, r5 ; src = *base_src + src
+ ldr r8, [r1, #vp8_blockd_predictor]
+
+ mov r9, #4 ; loop count
+
+loop_block
+
+ ldr r0, [r3], r7 ; src
+ ldr r1, [r8], r2 ; pred
+
+ uxtb16 r4, r0 ; [s2 | s0]
+ uxtb16 r5, r1 ; [p2 | p0]
+ uxtb16 r0, r0, ror #8 ; [s3 | s1]
+ uxtb16 r1, r1, ror #8 ; [p3 | p1]
+
+ usub16 r4, r4, r5 ; [d2 | d0]
+ usub16 r5, r0, r1 ; [d3 | d1]
+
+ subs r9, r9, #1 ; decrement loop counter
+
+ pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
+ pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
+
+ str r0, [r6, #0] ; diff
+ str r1, [r6, #4] ; diff
+
+ add r6, r6, r2, lsl #1 ; update diff pointer
+ bne loop_block
+
+ ldmfd sp!, {r4-r9}
+ mov pc, lr
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *usrc
+; r2 unsigned char *vsrc
+; r3 int src_stride
+; sp unsigned char *upred
+; sp unsigned char *vpred
+; sp int pred_stride
+|vp8_subtract_mbuv_armv6| PROC
+
+ stmfd sp!, {r4-r11}
+
+    add     r0, r0, #512            ; advance *diff to the Cb block (256 Y shorts = 512 bytes)
+ mov r4, #8 ; loop count
+ ldr r5, [sp, #32] ; upred
+ ldr r12, [sp, #40] ; pred_stride
+
+ ; Subtract U block
+loop_u
+ ldr r6, [r1] ; usrc (A)
+ ldr r7, [r5] ; upred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; usrc (B)
+ ldr r11, [r5, #4] ; upred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r1, r1, r3 ; update usrc pointer
+ add r5, r5, r12 ; update upred pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_u
+
+ ldr r5, [sp, #36] ; vpred
+ mov r4, #8 ; loop count
+
+ ; Subtract V block
+loop_v
+ ldr r6, [r2] ; vsrc (A)
+ ldr r7, [r5] ; vpred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r2, #4] ; vsrc (B)
+ ldr r11, [r5, #4] ; vpred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ add r2, r2, r3 ; update vsrc pointer
+ add r5, r5, r12 ; update vpred pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (B)
+
+ bne loop_v
+
+ ldmfd sp!, {r4-r11}
+ bx lr
+
+ ENDP
+
+
+; r0 short *diff
+; r1 unsigned char *src
+; r2 int src_stride
+; r3 unsigned char *pred
+; sp int pred_stride
+|vp8_subtract_mby_armv6| PROC
+
+ stmfd sp!, {r4-r11}
+ ldr r12, [sp, #32] ; pred_stride
+ mov r4, #16
+loop
+ ldr r6, [r1] ; src (A)
+ ldr r7, [r3] ; pred (A)
+
+ uxtb16 r8, r6 ; [s2 | s0] (A)
+ uxtb16 r9, r7 ; [p2 | p0] (A)
+ uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
+ uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (A)
+ usub16 r7, r10, r11 ; [d3 | d1] (A)
+
+ ldr r10, [r1, #4] ; src (B)
+ ldr r11, [r3, #4] ; pred (B)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
+
+ str r8, [r0], #4 ; diff (A)
+ uxtb16 r8, r10 ; [s2 | s0] (B)
+ str r9, [r0], #4 ; diff (A)
+
+ uxtb16 r9, r11 ; [p2 | p0] (B)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (B)
+ usub16 r7, r10, r11 ; [d3 | d1] (B)
+
+ ldr r10, [r1, #8] ; src (C)
+ ldr r11, [r3, #8] ; pred (C)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
+
+ str r8, [r0], #4 ; diff (B)
+ uxtb16 r8, r10 ; [s2 | s0] (C)
+ str r9, [r0], #4 ; diff (B)
+
+ uxtb16 r9, r11 ; [p2 | p0] (C)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (C)
+ usub16 r7, r10, r11 ; [d3 | d1] (C)
+
+ ldr r10, [r1, #12] ; src (D)
+ ldr r11, [r3, #12] ; pred (D)
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
+
+ str r8, [r0], #4 ; diff (C)
+ uxtb16 r8, r10 ; [s2 | s0] (D)
+ str r9, [r0], #4 ; diff (C)
+
+ uxtb16 r9, r11 ; [p2 | p0] (D)
+ uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
+ uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
+
+ usub16 r6, r8, r9 ; [d2 | d0] (D)
+ usub16 r7, r10, r11 ; [d3 | d1] (D)
+
+ add r1, r1, r2 ; update src pointer
+ add r3, r3, r12 ; update pred pointer
+
+ pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
+ pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
+
+ str r8, [r0], #4 ; diff (D)
+ subs r4, r4, #1 ; update loop counter
+ str r9, [r0], #4 ; diff (D)
+
+ bne loop
+
+ ldmfd sp!, {r4-r11}
+ bx lr
+
+ ENDP
+
+ END
+
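All three routines compute the same per-pixel residual, diff = src - pred, with the uxtb16/usub16 pairs handling four pixels per step. The 4x4 block case in scalar form, shaped like vp8_subtract_b_c() in vp8/encoder/encodemb.c:

void subtract_b_sketch(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *src_ptr    = *(be->base_src) + be->src;
    short         *diff_ptr   = be->src_diff;
    unsigned char *pred_ptr   = bd->predictor;
    int            src_stride = be->src_stride;
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
            diff_ptr[c] = src_ptr[c] - pred_ptr[c];

        diff_ptr += pitch;
        pred_ptr += pitch;
        src_ptr  += src_stride;
    }
}

The mby and mbuv variants are the same loop over a 16x16 luma block and two 8x8 chroma blocks, with the chroma output starting 256 shorts into *diff (the #512 byte offset above).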
diff --git a/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm b/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
new file mode 100644
index 0000000..5eaf3f2
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -0,0 +1,212 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_short_walsh4x4_armv6|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA |.text|, CODE, READONLY ; name this block of code
+
+;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_armv6| PROC
+
+ stmdb sp!, {r4 - r11, lr}
+
+ ldrd r4, r5, [r0], r2
+ ldr lr, c00040004
+ ldrd r6, r7, [r0], r2
+
+ ; 0-3
+ qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
+ qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
+
+ ldrd r8, r9, [r0], r2
+ ; 4-7
+ qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
+ qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
+
+ ldrd r10, r11, [r0]
+ ; 8-11
+ qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
+ qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
+
+ ; 12-15
+ qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
+ qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
+
+
+ lsls r2, r3, #16
+ smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
+ addne r11, r11, #1 ; A0 += (a1!=0)
+
+ lsls r2, r7, #16
+ smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; C0 += (a1!=0)
+
+ add r0, r11, r12 ; a1_0 = A0 + C0
+ sub r11, r11, r12 ; b1_0 = A0 - C0
+
+ lsls r2, r5, #16
+ smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
+ addne r12, r12, #1 ; B0 += (a1!=0)
+
+ lsls r2, r9, #16
+ smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
+ addne r2, r2, #1 ; D0 += (a1!=0)
+
+ add lr, r12, r2 ; d1_0 = B0 + D0
+ sub r12, r12, r2 ; c1_0 = B0 - D0
+
+ ; op[0,4,8,12]
+ adds r2, r0, lr ; a2 = a1_0 + d1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r0, lr ; d2 = a1_0 - d1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1] ; op[0]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ ldr lr, c00040004
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #24] ; op[12]
+
+ adds r2, r11, r12 ; b2 = b1_0 + c1_0
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r0, r11, r12 ; c2 = b1_0 - c1_0
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #8] ; op[4]
+
+ addmi r0, r0, #1 ; += a2 < 0
+ add r0, r0, #3 ; += 3
+ smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
+ smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
+ mov r0, r0, asr #3 ; >> 3
+ strh r0, [r1, #16] ; op[8]
+
+
+ ; op[3,7,11,15]
+ add r0, r3, r7 ; a1_3 = A3 + C3
+ sub r3, r3, r7 ; b1_3 = A3 - C3
+
+ smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
+ smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
+ add r7, r5, r9 ; d1_3 = B3 + D3
+ sub r5, r5, r9 ; c1_3 = B3 - D3
+
+ adds r2, r0, r7 ; a2 = a1_3 + d1_3
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r5 ; b2 = b1_3 + c1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #6] ; op[3]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r5 ; c2 = b1_3 - c1_3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #14] ; op[7]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r7 ; d2 = a1_3 - d1_3
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #22] ; op[11]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
+ smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #30] ; op[15]
+
+ ; op[1,5,9,13]
+ add r0, r3, r5 ; a1_1 = A1 + C1
+ sub r3, r3, r5 ; b1_1 = A1 - C1
+
+ smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
+ smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
+ add r5, r7, r9 ; d1_1 = B1 + D1
+ sub r7, r7, r9 ; c1_1 = B1 - D1
+
+ adds r2, r0, r5 ; a2 = a1_1 + d1_1
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r3, r7 ; b2 = b1_1 + c1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #2] ; op[1]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r3, r7 ; c2 = b1_1 - c1_1
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #10] ; op[5]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r0, r5 ; d2 = a1_1 - d1_1
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #18] ; op[9]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
+ smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #26] ; op[13]
+
+
+ ; op[2,6,10,14]
+ add r11, r4, r8 ; a1_2 = A2 + C2
+ sub r12, r4, r8 ; b1_2 = A2 - C2
+
+ smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
+ smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
+ add r4, r6, r10 ; d1_2 = B2 + D2
+ sub r8, r6, r10 ; c1_2 = B2 - D2
+
+ adds r2, r11, r4 ; a2 = a1_2 + d1_2
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ adds r9, r12, r8 ; b2 = b1_2 + c1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #4] ; op[2]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ subs r2, r12, r8 ; c2 = b1_2 - c1_2
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #12] ; op[6]
+
+ addmi r2, r2, #1 ; += a2 < 0
+ add r2, r2, #3 ; += 3
+ subs r9, r11, r4 ; d2 = a1_2 - d1_2
+ mov r2, r2, asr #3 ; >> 3
+ strh r2, [r1, #20] ; op[10]
+
+ addmi r9, r9, #1 ; += a2 < 0
+ add r9, r9, #3 ; += 3
+ mov r9, r9, asr #3 ; >> 3
+ strh r9, [r1, #28] ; op[14]
+
+
+ ldmia sp!, {r4 - r11, pc}
+ ENDP ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+ DCD 0x00040004
+
+ END
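The same computation in scalar form, sketched after the reference vp8_short_walsh4x4_c(); the addne instructions implement the '(a1 != 0)' term and the addmi instructions the 'x += x < 0' rounding adjustments.

void short_walsh4x4_sketch(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    int a2, b2, c2, d2;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = (ip[0] + ip[2]) << 2;
        d1 = (ip[1] + ip[3]) << 2;
        c1 = (ip[1] - ip[3]) << 2;
        b1 = (ip[0] - ip[2]) << 2;

        op[0] = a1 + d1 + (a1 != 0);     /* the addne adjustment */
        op[1] = b1 + c1;
        op[2] = b1 - c1;
        op[3] = d1 - a1;

        ip += pitch / 2;
        op += 4;
    }

    ip = output;
    op = output;

    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[8];
        d1 = ip[4] + ip[12];
        c1 = ip[4] - ip[12];
        b1 = ip[0] - ip[8];

        a2 = a1 + d1;
        b2 = b1 + c1;
        c2 = b1 - c1;
        d2 = a1 - d1;

        a2 += a2 < 0;                    /* the addmi adjustments */
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (a2 + 3) >> 3;
        op[4]  = (b2 + 3) >> 3;
        op[8]  = (c2 + 3) >> 3;
        op[12] = (d2 + 3) >> 3;

        ip++;
        op++;
    }
}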
diff --git a/libvpx/vp8/encoder/arm/boolhuff_arm.c b/libvpx/vp8/encoder/arm/boolhuff_arm.c
new file mode 100644
index 0000000..17a941b
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/boolhuff_arm.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/boolhuff.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+const unsigned int vp8_prob_cost[256] =
+{
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
+int vp8_validate_buffer_arm(const unsigned char *start,
+ size_t len,
+ const unsigned char *end,
+ struct vpx_internal_error_info *error)
+{
+ return validate_buffer(start, len, end, error);
+}
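vp8_validate_buffer_arm() exists so that the assembly (through the VALIDATE_POS macro) can reach validate_buffer(), an inline helper in vp8/encoder/boolhuff.h that is not directly callable from the .asm files. The helper is a bounds check that raises a codec error rather than writing past the partition buffer; roughly:

static int validate_buffer_sketch(const unsigned char *start,
                                  size_t len,
                                  const unsigned char *end,
                                  struct vpx_internal_error_info *error)
{
    /* The first test also catches pointer-arithmetic overflow. */
    if (start + len > start && start + len <= end)
        return 1;

    vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
                       "Truncated packet or corrupt partition");

    return 0;
}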
diff --git a/libvpx/vp8/encoder/arm/dct_arm.c b/libvpx/vp8/encoder/arm/dct_arm.c
new file mode 100644
index 0000000..af0fb27
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/dct_arm.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#if HAVE_MEDIA
+
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_armv6(input, output, pitch);
+ vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
+}
+
+#endif /* HAVE_MEDIA */
diff --git a/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
new file mode 100644
index 0000000..1430588
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -0,0 +1,258 @@
+;
+; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_fast_quantize_b_neon|
+ EXPORT |vp8_fast_quantize_b_pair_neon|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+ stmfd sp!, {r4-r9}
+ vstmdb sp!, {q4-q7}
+
+ ldr r4, [r0, #vp8_block_coeff]
+ ldr r5, [r0, #vp8_block_quant_fast]
+ ldr r6, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r4@128] ; load z
+
+ ldr r7, [r2, #vp8_blockd_qcoeff]
+
+ vabs.s16 q4, q0 ; calculate x = abs(z)
+ vabs.s16 q5, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
+
+ ldr r4, [r1, #vp8_block_coeff]
+
+ vadd.s16 q4, q6 ; x + Round
+ vadd.s16 q5, q7
+
+ vld1.16 {q0, q1}, [r4@128] ; load z2
+
+ vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q5, q9
+
+ vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
+ vabs.s16 q11, q1
+ vshr.s16 q12, q0, #15 ; sz2
+ vshr.s16 q13, q1, #15
+
+ ;modify data to have its original sign
+ veor.s16 q4, q2 ; y^sz
+ veor.s16 q5, q3
+
+ vadd.s16 q10, q6 ; x2 + Round
+ vadd.s16 q11, q7
+
+ ldr r8, [r2, #vp8_blockd_dequant]
+
+ vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q11, q9
+
+ vshr.s16 q4, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q5, #1
+
+ vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
+
+ vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q5, q3
+
+ vshr.s16 q10, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q11, #1
+
+ ldr r9, [r2, #vp8_blockd_dqcoeff]
+
+ veor.s16 q10, q12 ; y2^sz2
+ veor.s16 q11, q13
+
+ vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
+
+
+ vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q11, q13
+
+ ldr r6, [r3, #vp8_blockd_qcoeff]
+
+ vmul.s16 q2, q6, q4 ; x * Dequant
+ vmul.s16 q3, q7, q5
+
+ adr r0, inv_zig_zag ; load ptr of inverse zigzag table
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
+
+ vmul.s16 q12, q6, q10 ; x2 * Dequant
+ vmul.s16 q13, q7, q11
+
+ vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
+
+ vtst.16 q14, q4, q8 ; now find eob
+ vtst.16 q15, q5, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
+
+ ldr r7, [r3, #vp8_blockd_dqcoeff]
+
+ vand q0, q6, q14 ; get all valid numbers from scan array
+ vand q1, q7, q15
+
+ vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
+
+ vtst.16 q2, q10, q8 ; now find eob
+ vtst.16 q3, q11, q8 ; non-zero element is set to all 1
+
+ vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
+
+ vand q10, q6, q2 ; get all valid numbers from scan array
+ vand q11, q7, q3
+ vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
+
+ vmax.u16 d0, d0, d1
+ vmax.u16 d20, d20, d21
+ vmovl.u16 q0, d0
+ vmovl.u16 q10, d20
+
+ vmax.u32 d0, d0, d1
+ vmax.u32 d20, d20, d21
+ vpmax.u32 d0, d0, d0
+ vpmax.u32 d20, d20, d20
+
+ ldr r4, [r2, #vp8_blockd_eob]
+ ldr r5, [r3, #vp8_blockd_eob]
+
+ vst1.8 {d0[0]}, [r4] ; store eob
+ vst1.8 {d20[0]}, [r5] ; store eob
+
+ vldmia sp!, {q4-q7}
+ ldmfd sp!, {r4-r9}
+ bx lr
+
+ ENDP
+
+;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+|vp8_fast_quantize_b_neon| PROC
+
+ stmfd sp!, {r4-r7}
+
+ ldr r3, [r0, #vp8_block_coeff]
+ ldr r4, [r0, #vp8_block_quant_fast]
+ ldr r5, [r0, #vp8_block_round]
+
+ vld1.16 {q0, q1}, [r3@128] ; load z
+ vorr.s16 q14, q0, q1 ; check if all zero (step 1)
+ ldr r6, [r1, #vp8_blockd_qcoeff]
+ ldr r7, [r1, #vp8_blockd_dqcoeff]
+ vorr.s16 d28, d28, d29 ; check if all zero (step 2)
+
+ vabs.s16 q12, q0 ; calculate x = abs(z)
+ vabs.s16 q13, q1
+
+ ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+ vshr.s16 q2, q0, #15 ; sz
+ vmov r2, r3, d28 ; check if all zero (step 3)
+ vshr.s16 q3, q1, #15
+
+ vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
+ vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
+
+ vadd.s16 q12, q14 ; x + Round
+ vadd.s16 q13, q15
+
+ adr r0, inv_zig_zag ; load ptr of inverse zigzag table
+
+ vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
+ vqdmulh.s16 q13, q9
+
+ vld1.16 {q10, q11}, [r0@128]; load inverse scan order
+
+ vceq.s16 q8, q8 ; set q8 to all 1
+
+ ldr r4, [r1, #vp8_blockd_dequant]
+
+ vshr.s16 q12, #1 ; right shift 1 after vqdmulh
+ vshr.s16 q13, #1
+
+ ldr r5, [r1, #vp8_blockd_eob]
+
+ orr r2, r2, r3 ; check if all zero (step 4)
+ cmp r2, #0 ; check if all zero (step 5)
+ beq zero_output ; check if all zero (step 6)
+
+ ;modify data to have its original sign
+ veor.s16 q12, q2 ; y^sz
+ veor.s16 q13, q3
+
+ vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+ vsub.s16 q13, q3
+
+ vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
+
+ vtst.16 q14, q12, q8 ; now find eob
+ vtst.16 q15, q13, q8 ; non-zero element is set to all 1
+
+ vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
+
+ vand q10, q10, q14 ; get all valid numbers from scan array
+ vand q11, q11, q15
+
+
+ vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
+ vmax.u16 d0, d0, d1
+ vmovl.u16 q0, d0
+
+ vmul.s16 q2, q12 ; x * Dequant
+ vmul.s16 q3, q13
+
+ vmax.u32 d0, d0, d1
+ vpmax.u32 d0, d0, d0
+
+ vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
+
+ vst1.8 {d0[0]}, [r5] ; store eob
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+zero_output
+ strb r2, [r5] ; store eob
+ vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
+ vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+ ENDP
+
+; default inverse zigzag table is defined in vp8/common/entropy.c
+ ALIGN 16 ; enable use of @128 bit aligned loads
+inv_zig_zag
+ DCW 0x0001, 0x0002, 0x0006, 0x0007
+ DCW 0x0003, 0x0005, 0x0008, 0x000d
+ DCW 0x0004, 0x0009, 0x000c, 0x000e
+ DCW 0x000a, 0x000b, 0x000f, 0x0010
+
+ END
+
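The eob computation above is worth spelling out: instead of scanning backwards through the coefficients in zigzag order, the code masks the inv_zig_zag table by the nonzero lanes (vtst/vand) and reduces with vmax, which directly yields "index of the last nonzero coefficient, plus one". A scalar sketch of the same idea, with the table values copied from the DCW block above:

static const short inv_zig_zag[16] =
{
     1,  2,  6,  7,
     3,  5,  8, 13,
     4,  9, 12, 14,
    10, 11, 15, 16
};

static char eob_from_inverse_scan(const short *qcoeff)
{
    short eob = 0;
    int i;

    for (i = 0; i < 16; i++)
    {
        /* A nonzero coefficient at raster position i contributes its
         * 1-based zigzag index; the running maximum is the eob. */
        if (qcoeff[i] && inv_zig_zag[i] > eob)
            eob = inv_zig_zag[i];
    }

    return (char)eob;
}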
diff --git a/libvpx/vp8/encoder/arm/neon/picklpf_arm.c b/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
new file mode 100644
index 0000000..ec8071e
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/picklpf_arm.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/loopfilter.h"
+#include "vpx_scale/yv12config.h"
+
+extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
+ unsigned char *src_ptr,
+ int sz);
+
+
+void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc)
+{
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int yoffset;
+ int linestocopy;
+
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+    /* Copy an extra 4 lines so that the full filter context is available
+     * if filtering is done on the copied partial frame rather than the
+     * original. The partial filter also does mb filtering for the top row,
+     * which can modify 3 pixels above.
+     */
+ linestocopy += 4;
+ /* partial image starts at ~middle of frame (macroblock border) */
+ yoffset = ystride * (((yheight >> 5) * 16) - 4);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
+}
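As a worked example of the arithmetic (assuming PARTIAL_FRAME_FRACTION is 8, its value in vp8/common/loopfilter.h at this revision): for a 720-line frame, yheight >> 4 gives 45 macroblock rows, 45 / 8 = 5, and 5 << 4 = 80, so 80 + 4 = 84 lines are copied. The offset works out to ystride * ((720 >> 5) * 16 - 4) = ystride * 348, i.e. the band starts at the macroblock boundary nearest the middle of the frame, minus the 4 lines of filter context.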
diff --git a/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
new file mode 100644
index 0000000..09dd011
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -0,0 +1,221 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_fdct4x4_neon|
+ EXPORT |vp8_short_fdct8x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=4
+
+
+ ALIGN 16 ; enable use of @128 bit aligned loads
+coeff
+ DCW 5352, 5352, 5352, 5352
+ DCW 2217, 2217, 2217, 2217
+ DCD 14500, 14500, 14500, 14500
+ DCD 7500, 7500, 7500, 7500
+ DCD 12000, 12000, 12000, 12000
+ DCD 51000, 51000, 51000, 51000
+
+;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct4x4_neon| PROC
+
+ ; Part one
+ vld1.16 {d0}, [r0@64], r2
+ adr r12, coeff
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {d2}, [r0@64], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {d3}, [r0@64], r2
+
+ ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
+ vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
+ vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q2, q2, #3 ; (a1, b1) << 3
+ vshl.s16 q3, q3, #3 ; (c1, d1) << 3
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
+ vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
+
+ vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
+ vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
+ vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+
+ ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vmov.s16 d26, #7
+
+ vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
+ vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
+ vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
+ vadd.s16 d4, d4, d26 ; a1 + 7
+ vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
+ vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
+
+ vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
+ vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
+
+ vceq.s16 d4, d7, #0
+
+ vshr.s16 d0, d0, #4
+ vshr.s16 d2, d2, #4
+
+ vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 d4, d4
+ vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
+ vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
+
+ vst1.16 {q0, q1}, [r1@128]
+
+ bx lr
+
+ ENDP
+
+;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+|vp8_short_fdct8x4_neon| PROC
+
+ ; Part one
+
+ vld1.16 {q0}, [r0@128], r2
+ adr r12, coeff
+ vld1.16 {q1}, [r0@128], r2
+ vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
+ vld1.16 {q2}, [r0@128], r2
+ vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
+ vld1.16 {q3}, [r0@128], r2
+
+ ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
+ vtrn.32 q0, q2 ; [A0|B0]
+ vtrn.32 q1, q3 ; [A1|B1]
+ vtrn.16 q0, q1 ; [A2|B2]
+ vtrn.16 q2, q3 ; [A3|B3]
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
+ vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
+ vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
+
+ vshl.s16 q11, q11, #3 ; a1 << 3
+ vshl.s16 q12, q12, #3 ; b1 << 3
+ vshl.s16 q13, q13, #3 ; c1 << 3
+ vshl.s16 q14, q14, #3 ; d1 << 3
+
+ vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
+ vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
+
+ vmov.s16 q11, q9 ; 14500
+ vmov.s16 q12, q10 ; 7500
+
+ vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
+ vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
+ vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
+ vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
+
+ vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
+ vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
+ vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
+
+ vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
+ vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
+ vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
+
+
+ ; Part two
+ vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
+
+ ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
+ vtrn.32 q0, q2 ; q0=[A0 | B0]
+ vtrn.32 q1, q3 ; q1=[A4 | B4]
+ vtrn.16 q0, q1 ; q2=[A8 | B8]
+ vtrn.16 q2, q3 ; q3=[A12|B12]
+
+ vmov.s16 q15, #7
+
+ vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
+ vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
+ vadd.s16 q11, q11, q15 ; a1 + 7
+ vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
+ vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
+
+ vadd.s16 q0, q11, q12 ; a1 + b1 + 7
+ vsub.s16 q1, q11, q12 ; a1 - b1 + 7
+
+ vmov.s16 q11, q9 ; 12000
+ vmov.s16 q12, q10 ; 51000
+
+ vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
+ vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
+ vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
+
+
+ vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
+ vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
+ vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
+ vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
+
+ vceq.s16 q14, q14, #0
+
+ vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
+ vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
+ vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
+
+ vmvn.s16 q14, q14
+
+ vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
+
+ vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
+ vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
+ vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
+
+ vst1.16 {q0, q1}, [r1@128]! ; block A
+ vst1.16 {q2, q3}, [r1@128]! ; block B
+
+ bx lr
+
+ ENDP
+
+ END
+
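[Editor's note] For reference, the scalar routine the two NEON transforms above implement looks roughly like the sketch below, reconstructed from the register comments and constants in the listing (the upstream vp8_short_fdct4x4_c may differ in minor detail). Each pass is a 4-point butterfly: pass one works on rows with a <<3 pre-scale and >>12 rounding; pass two works on columns with >>16 rounding plus the (d1 != 0) correction on op[4].

    void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)             /* pass one: rows */
        {
            a1 = (ip[0] + ip[3]) << 3;
            b1 = (ip[1] + ip[2]) << 3;
            c1 = (ip[1] - ip[2]) << 3;
            d1 = (ip[0] - ip[3]) << 3;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12;

            ip += pitch / 2;                /* pitch is in bytes */
            op += 4;
        }

        ip = op = output;
        for (i = 0; i < 4; i++)             /* pass two: columns */
        {
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (a1 + b1 + 7) >> 4;
            op[8]  = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] = (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }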
diff --git a/libvpx/vp8/encoder/arm/neon/subtract_neon.asm b/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
new file mode 100644
index 0000000..91a328c
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/subtract_neon.asm
@@ -0,0 +1,199 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp8_subtract_b_neon|
+ EXPORT |vp8_subtract_mby_neon|
+ EXPORT |vp8_subtract_mbuv_neon|
+
+ INCLUDE asm_enc_offsets.asm
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+|vp8_subtract_b_neon| PROC
+
+ stmfd sp!, {r4-r7}
+
+ ldr r3, [r0, #vp8_block_base_src]
+ ldr r4, [r0, #vp8_block_src]
+ ldr r5, [r0, #vp8_block_src_diff]
+ ldr r3, [r3]
+ ldr r6, [r0, #vp8_block_src_stride]
+ add r3, r3, r4 ; src = *base_src + src
+ ldr r7, [r1, #vp8_blockd_predictor]
+
+ vld1.8 {d0}, [r3], r6 ;load src
+ vld1.8 {d1}, [r7], r2 ;load pred
+ vld1.8 {d2}, [r3], r6
+ vld1.8 {d3}, [r7], r2
+ vld1.8 {d4}, [r3], r6
+ vld1.8 {d5}, [r7], r2
+ vld1.8 {d6}, [r3], r6
+ vld1.8 {d7}, [r7], r2
+
+ vsubl.u8 q10, d0, d1
+ vsubl.u8 q11, d2, d3
+ vsubl.u8 q12, d4, d5
+ vsubl.u8 q13, d6, d7
+
+ mov r2, r2, lsl #1
+
+ vst1.16 {d20}, [r5], r2 ;store diff
+ vst1.16 {d22}, [r5], r2
+ vst1.16 {d24}, [r5], r2
+ vst1.16 {d26}, [r5], r2
+
+ ldmfd sp!, {r4-r7}
+ bx lr
+
+ ENDP
+
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
+; unsigned char *pred, int pred_stride)
+|vp8_subtract_mby_neon| PROC
+ push {r4-r7}
+ mov r12, #4
+ ldr r4, [sp, #16] ; pred_stride
+ mov r6, #32 ; "diff" stride x2
+ add r5, r0, #16 ; second diff pointer
+
+subtract_mby_loop
+ vld1.8 {q0}, [r1], r2 ;load src
+ vld1.8 {q1}, [r3], r4 ;load pred
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r3], r4
+ vld1.8 {q4}, [r1], r2
+ vld1.8 {q5}, [r3], r4
+ vld1.8 {q6}, [r1], r2
+ vld1.8 {q7}, [r3], r4
+
+ vsubl.u8 q8, d0, d2
+ vsubl.u8 q9, d1, d3
+ vsubl.u8 q10, d4, d6
+ vsubl.u8 q11, d5, d7
+ vsubl.u8 q12, d8, d10
+ vsubl.u8 q13, d9, d11
+ vsubl.u8 q14, d12, d14
+ vsubl.u8 q15, d13, d15
+
+ vst1.16 {q8}, [r0], r6 ;store diff
+ vst1.16 {q9}, [r5], r6
+ vst1.16 {q10}, [r0], r6
+ vst1.16 {q11}, [r5], r6
+ vst1.16 {q12}, [r0], r6
+ vst1.16 {q13}, [r5], r6
+ vst1.16 {q14}, [r0], r6
+ vst1.16 {q15}, [r5], r6
+
+ subs r12, r12, #1
+ bne subtract_mby_loop
+
+ pop {r4-r7}
+ bx lr
+ ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+
+|vp8_subtract_mbuv_neon| PROC
+ push {r4-r7}
+ ldr r4, [sp, #16] ; upred
+ ldr r5, [sp, #20] ; vpred
+ ldr r6, [sp, #24] ; pred_stride
+ add r0, r0, #512 ; short *udiff = diff + 256;
+ mov r12, #32 ; "diff" stride x2
+ add r7, r0, #16 ; second diff pointer
+
+;u
+ vld1.8 {d0}, [r1], r3 ;load usrc
+ vld1.8 {d1}, [r4], r6 ;load upred
+ vld1.8 {d2}, [r1], r3
+ vld1.8 {d3}, [r4], r6
+ vld1.8 {d4}, [r1], r3
+ vld1.8 {d5}, [r4], r6
+ vld1.8 {d6}, [r1], r3
+ vld1.8 {d7}, [r4], r6
+ vld1.8 {d8}, [r1], r3
+ vld1.8 {d9}, [r4], r6
+ vld1.8 {d10}, [r1], r3
+ vld1.8 {d11}, [r4], r6
+ vld1.8 {d12}, [r1], r3
+ vld1.8 {d13}, [r4], r6
+ vld1.8 {d14}, [r1], r3
+ vld1.8 {d15}, [r4], r6
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0], r12 ;store diff
+ vst1.16 {q9}, [r7], r12
+ vst1.16 {q10}, [r0], r12
+ vst1.16 {q11}, [r7], r12
+ vst1.16 {q12}, [r0], r12
+ vst1.16 {q13}, [r7], r12
+ vst1.16 {q14}, [r0], r12
+ vst1.16 {q15}, [r7], r12
+
+;v
+ vld1.8 {d0}, [r2], r3 ;load vsrc
+ vld1.8 {d1}, [r5], r6 ;load vpred
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r5], r6
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r5], r6
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d7}, [r5], r6
+ vld1.8 {d8}, [r2], r3
+ vld1.8 {d9}, [r5], r6
+ vld1.8 {d10}, [r2], r3
+ vld1.8 {d11}, [r5], r6
+ vld1.8 {d12}, [r2], r3
+ vld1.8 {d13}, [r5], r6
+ vld1.8 {d14}, [r2], r3
+ vld1.8 {d15}, [r5], r6
+
+ vsubl.u8 q8, d0, d1
+ vsubl.u8 q9, d2, d3
+ vsubl.u8 q10, d4, d5
+ vsubl.u8 q11, d6, d7
+ vsubl.u8 q12, d8, d9
+ vsubl.u8 q13, d10, d11
+ vsubl.u8 q14, d12, d13
+ vsubl.u8 q15, d14, d15
+
+ vst1.16 {q8}, [r0], r12 ;store diff
+ vst1.16 {q9}, [r7], r12
+ vst1.16 {q10}, [r0], r12
+ vst1.16 {q11}, [r7], r12
+ vst1.16 {q12}, [r0], r12
+ vst1.16 {q13}, [r7], r12
+ vst1.16 {q14}, [r0], r12
+ vst1.16 {q15}, [r7], r12
+
+ pop {r4-r7}
+ bx lr
+
+ ENDP
+
+ END
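[Editor's note] All three routines above compute pixel-wise residuals; the NEON versions simply widen eight or sixteen u8 subtractions per instruction (vsubl.u8). A scalar sketch of vp8_subtract_mby, using the signature from the comment in the listing (the upstream C version may arrange its loops differently):

    void subtract_mby_sketch(short *diff, unsigned char *src, int src_stride,
                             unsigned char *pred, int pred_stride)
    {
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
                diff[c] = (short)(src[c] - pred[c]);

            diff += 16;       /* 16 shorts = the 32-byte "diff" stride above */
            src  += src_stride;
            pred += pred_stride;
        }
    }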
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
new file mode 100644
index 0000000..5b9f11e
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -0,0 +1,70 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_memcpy_partial_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;note: this is not a general-purpose memcpy; sz is assumed to be a multiple of 16
+;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
+; int sz);
+|vp8_memcpy_partial_neon| PROC
+ ;pld [r1] ;preload pred data
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ mov r12, r2, lsr #8 ;copy 256 bytes of data at a time
+
+memcpy_neon_loop
+ vld1.8 {q0, q1}, [r1]! ;load src data
+ subs r12, r12, #1
+ vld1.8 {q2, q3}, [r1]!
+ vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
+ vld1.8 {q4, q5}, [r1]!
+ vst1.8 {q2, q3}, [r0]!
+ vld1.8 {q6, q7}, [r1]!
+ vst1.8 {q4, q5}, [r0]!
+ vld1.8 {q8, q9}, [r1]!
+ vst1.8 {q6, q7}, [r0]!
+ vld1.8 {q10, q11}, [r1]!
+ vst1.8 {q8, q9}, [r0]!
+ vld1.8 {q12, q13}, [r1]!
+ vst1.8 {q10, q11}, [r0]!
+ vld1.8 {q14, q15}, [r1]!
+ vst1.8 {q12, q13}, [r0]!
+ vst1.8 {q14, q15}, [r0]!
+
+ ;pld [r1] ;preload pred data -- need to adjust for real device
+ ;pld [r1, #128]
+ ;pld [r1, #256]
+ ;pld [r1, #384]
+
+ bne memcpy_neon_loop
+
+ ands r3, r2, #0xff ;extra copy
+ beq done_copy_neon_loop
+
+extra_copy_neon_loop
+ vld1.8 {q0}, [r1]! ;load src data
+ subs r3, r3, #16
+ vst1.8 {q0}, [r0]!
+ bne extra_copy_neon_loop
+
+done_copy_neon_loop
+ bx lr
+ ENDP
+
+ END
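[Editor's note] The loop structure amounts to copying 256-byte chunks and then a 16-byte-granular tail, which is why the function is "partial": a size that is not a multiple of 16 would make the tail loop (subs ... #16) miss its exit condition. A C sketch of the contract, illustrative only:

    #include <string.h>

    void memcpy_partial_sketch(unsigned char *dst, unsigned char *src, int sz)
    {
        int i;

        for (i = 0; i + 256 <= sz; i += 256)   /* main loop: 256-byte chunks */
            memcpy(dst + i, src + i, 256);

        for (; i < sz; i += 16)                /* tail: 16-byte chunks */
            memcpy(dst + i, src + i, 16);
    }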
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
new file mode 100644
index 0000000..55edbf5
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -0,0 +1,116 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_mse16x16_neon|
+ EXPORT |vp8_get4x4sse_cs_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0 unsigned char *src_ptr
+; r1 int source_stride
+; r2 unsigned char *ref_ptr
+; r3 int recon_stride
+; stack unsigned int *sse
+;note: in this function the sum is never used, so the sum calculation from
+;vp8_variance() has been removed.
+
+|vp8_mse16x16_neon| PROC
+ vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
+ vmov.i8 q8, #0
+ vmov.i8 q9, #0
+ vmov.i8 q10, #0
+
+ mov r12, #8
+
+mse16x16_neon_loop
+ vld1.8 {q0}, [r0], r1 ;Load up source and reference
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q1}, [r0], r1
+ vld1.8 {q3}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmlal.s16 q7, d22, d22
+ vmlal.s16 q8, d23, d23
+
+ subs r12, r12, #1
+
+ vmlal.s16 q9, d24, d24
+ vmlal.s16 q10, d25, d25
+ vmlal.s16 q7, d26, d26
+ vmlal.s16 q8, d27, d27
+ vmlal.s16 q9, d28, d28
+ vmlal.s16 q10, d29, d29
+
+ bne mse16x16_neon_loop
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+
+ ldr r12, [sp] ;load *sse from stack
+
+ vadd.u32 q10, q7, q9
+ vpaddl.u32 q1, q10
+ vadd.u64 d0, d2, d3
+
+ vst1.32 {d0[0]}, [r12]
+ vmov.32 r0, d0[0]
+
+ bx lr
+
+ ENDP
+
+
+;=============================
+; r0 unsigned char *src_ptr,
+; r1 int source_stride,
+; r2 unsigned char *ref_ptr,
+; r3 int recon_stride
+|vp8_get4x4sse_cs_neon| PROC
+ vld1.8 {d0}, [r0], r1 ;Load up source and reference
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d7}, [r2], r3
+
+ vsubl.u8 q11, d0, d4
+ vsubl.u8 q12, d1, d5
+ vsubl.u8 q13, d2, d6
+ vsubl.u8 q14, d3, d7
+
+ vmull.s16 q7, d22, d22
+ vmull.s16 q8, d24, d24
+ vmull.s16 q9, d26, d26
+ vmull.s16 q10, d28, d28
+
+ vadd.u32 q7, q7, q8
+ vadd.u32 q9, q9, q10
+ vadd.u32 q9, q7, q9
+
+ vpaddl.u32 q1, q9
+ vadd.u64 d0, d2, d3
+
+ vmov.32 r0, d0[0]
+ bx lr
+
+ ENDP
+
+ END
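[Editor's note] Scalar equivalent of vp8_mse16x16 for orientation (a sketch; parameter names follow the register comments above). As the note in the listing says, only the sum of squared differences is produced:

    unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse)
    {
        unsigned int sum = 0;
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
            {
                const int d = src[c] - ref[c];
                sum += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }

        *sse = sum;     /* stored through the stack argument */
        return sum;     /* and also returned in r0 */
    }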
diff --git a/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
new file mode 100644
index 0000000..2226629
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -0,0 +1,103 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ EXPORT |vp8_short_walsh4x4_neon|
+
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0 short *input,
+; r1 short *output,
+; r2 int pitch
+|vp8_short_walsh4x4_neon| PROC
+
+ vld1.16 {d0}, [r0@64], r2 ; load input
+ vld1.16 {d1}, [r0@64], r2
+ vld1.16 {d2}, [r0@64], r2
+ vld1.16 {d3}, [r0@64]
+
+ ;First for-loop
+ ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+
+ vmov.s32 q15, #3 ; add 3 to all values
+
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
+ vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
+ vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
+ vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
+
+ vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
+ vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
+ vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
+ vceq.s16 d16, d4, #0 ; a1 == 0
+ vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
+
+ vadd.s16 d0, d4, d5 ; a1 + d1
+ vmvn d16, d16 ; a1 != 0
+ vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
+ vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
+ vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
+ vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
+
+ ;Second for-loop
+ ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+ vtrn.32 d1, d3
+ vtrn.32 d0, d2
+ vtrn.16 d2, d3
+ vtrn.16 d0, d1
+
+ vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
+ vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
+ vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
+ vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
+
+ vadd.s32 q0, q8, q9 ; a2 = a1 + d1
+ vadd.s32 q1, q11, q10 ; b2 = b1 + c1
+ vsub.s32 q2, q11, q10 ; c2 = b1 - c1
+ vsub.s32 q3, q8, q9 ; d2 = a1 - d1
+
+ vclt.s32 q8, q0, #0
+ vclt.s32 q9, q1, #0
+ vclt.s32 q10, q2, #0
+ vclt.s32 q11, q3, #0
+
+ ; subtract -1 (or 0)
+ vsub.s32 q0, q0, q8 ; a2 += a2 < 0
+ vsub.s32 q1, q1, q9 ; b2 += b2 < 0
+ vsub.s32 q2, q2, q10 ; c2 += c2 < 0
+ vsub.s32 q3, q3, q11 ; d2 += d2 < 0
+
+ vadd.s32 q8, q0, q15 ; a2 + 3
+ vadd.s32 q9, q1, q15 ; b2 + 3
+ vadd.s32 q10, q2, q15 ; c2 + 3
+ vadd.s32 q11, q3, q15 ; d2 + 3
+
+ ; vrshrn? would add the rounding constant 1 << (3 - 1) = 4
+ vshrn.s32 d0, q8, #3
+ vshrn.s32 d1, q9, #3
+ vshrn.s32 d2, q10, #3
+ vshrn.s32 d3, q11, #3
+
+ vst1.16 {q0, q1}, [r1@128]
+
+ bx lr
+
+ ENDP
+
+ END
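[Editor's note] The register comments map directly onto the scalar Walsh-Hadamard transform; a sketch reconstructed from them follows (the upstream vp8_short_walsh4x4_c should match up to loop arrangement). Note the two asymmetric rounding steps: op[0] of pass one absorbs (a1 != 0), and pass two biases negative values by their sign bit before the +3 and >>3:

    void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)                 /* pass one: rows */
        {
            a1 = (ip[0] + ip[2]) << 2;
            d1 = (ip[1] + ip[3]) << 2;
            c1 = (ip[1] - ip[3]) << 2;
            b1 = (ip[0] - ip[2]) << 2;

            op[0] = a1 + d1 + (a1 != 0);
            op[1] = b1 + c1;
            op[2] = b1 - c1;
            op[3] = a1 - d1;

            ip += pitch / 2;
            op += 4;
        }

        ip = op = output;
        for (i = 0; i < 4; i++)                 /* pass two: columns */
        {
            a1 = ip[0] + ip[8];
            d1 = ip[4] + ip[12];
            c1 = ip[4] - ip[12];
            b1 = ip[0] - ip[8];

            a2 = a1 + d1;
            b2 = b1 + c1;
            c2 = b1 - c1;
            d2 = a1 - d1;

            a2 += a2 < 0;                       /* bias negatives before */
            b2 += b2 < 0;                       /* the rounding shift    */
            c2 += c2 < 0;
            d2 += d2 < 0;

            op[0]  = (a2 + 3) >> 3;
            op[4]  = (b2 + 3) >> 3;
            op[8]  = (c2 + 3) >> 3;
            op[12] = (d2 + 3) >> 3;

            ip++;
            op++;
        }
    }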
diff --git a/libvpx/vp8/encoder/arm/quantize_arm.c b/libvpx/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 0000000..8999e34
--- /dev/null
+++ b/libvpx/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/encoder/block.h"
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+
+
+#if HAVE_NEON
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only by using the quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer */
+void vp8_quantize_mby_neon(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 16; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+ if(has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 24; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+ if (has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i+=2)
+ x->quantize_b_pair(&x->block[i], &x->block[i+1],
+ &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+}
+
+#endif /* HAVE_NEON */
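[Editor's note] The pairing is purely a call-granularity change: each quantize_b_pair call quantizes two adjacent 4x4 blocks so the NEON kernel can fill its vector lanes. A generic fallback would just be two scalar calls through the existing pointer; the helper below is a sketch using only the MACROBLOCK fields referenced above:

    static void quantize_b_pair_fallback(MACROBLOCK *x, int i)
    {
        x->quantize_b(&x->block[i],     &x->e_mbd.block[i]);
        x->quantize_b(&x->block[i + 1], &x->e_mbd.block[i + 1]);
    }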
diff --git a/libvpx/vp8/encoder/asm_enc_offsets.c b/libvpx/vp8/encoder/asm_enc_offsets.c
new file mode 100644
index 0000000..a4169b3
--- /dev/null
+++ b/libvpx/vp8/encoder/asm_enc_offsets.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/asm_offsets.h"
+#include "vpx_config.h"
+#include "block.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
+BEGIN
+
+/* regular quantize */
+DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
+DEFINE(vp8_block_zbin, offsetof(BLOCK, zbin));
+DEFINE(vp8_block_round, offsetof(BLOCK, round));
+DEFINE(vp8_block_quant, offsetof(BLOCK, quant));
+DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_zbin_extra, offsetof(BLOCK, zbin_extra));
+DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift));
+
+DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
+
+/* subtract */
+DEFINE(vp8_block_base_src, offsetof(BLOCK, base_src));
+DEFINE(vp8_block_src, offsetof(BLOCK, src));
+DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff));
+DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride));
+
+DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor));
+
+/* pack tokens */
+DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
+DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
+DEFINE(vp8_writer_count, offsetof(vp8_writer, count));
+DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos));
+DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer));
+DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end));
+DEFINE(vp8_writer_error, offsetof(vp8_writer, error));
+
+DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
+
+DEFINE(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct));
+
+DEFINE(vp8_token_value, offsetof(vp8_token, value));
+DEFINE(vp8_token_len, offsetof(vp8_token, Len));
+
+DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
+
+DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist));
+DEFINE(vp8_comp_common, offsetof(VP8_COMP, common));
+DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc));
+DEFINE(vp8_writer_sz , sizeof(vp8_writer));
+
+DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
+
+DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
+
+END
+
+/* Add asserts for any offset that is not supported by assembly code.
+ * Add asserts for any size that is not supported by assembly code.
+ *
+ * These are used in vp8cx_pack_tokens. They are hard coded, so if their
+ * sizes change the assembly will have to be adjusted.
+ */
+
+#if HAVE_EDSP
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif
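[Editor's note] Each DEFINE above emits a named constant that the build extracts into asm_enc_offsets.asm, which the ARM sources INCLUDE (see the #vp8_block_base_src loads in subtract_neon.asm earlier in this change). The ct_assert lines are compile-time checks; the usual trick, roughly as vpx_ports/asm_offsets.h implements it, is a switch whose case labels collide when the condition is false. The macro name below is illustrative only:

    #define CT_ASSERT_SKETCH(name, cond) \
        static void assert_##name(void)  \
        { switch (0) { case 0: case !!(cond):; } }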
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
new file mode 100644
index 0000000..f84ae68
--- /dev/null
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -0,0 +1,1732 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/header.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/findnearmv.h"
+#include "mcomp.h"
+#include "vp8/common/systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+#include "vp8/common/pragmas.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+
+#include "defaultcoefcounts.h"
+#include "vp8/common/common.h"
+
+const int vp8cx_base_skip_false_prob[128] =
+{
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255,
+ 251, 248, 244, 240, 236, 232, 229, 225,
+ 221, 217, 213, 208, 204, 199, 194, 190,
+ 187, 183, 179, 175, 172, 168, 164, 160,
+ 157, 153, 149, 145, 142, 138, 134, 130,
+ 127, 124, 120, 117, 114, 110, 107, 104,
+ 101, 98, 95, 92, 89, 86, 83, 80,
+ 77, 74, 71, 68, 65, 62, 59, 56,
+ 53, 50, 47, 44, 41, 38, 35, 32,
+ 30, 28, 26, 24, 22, 20, 18, 16,
+};
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats[10][10][10];
+static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+
+static void update_mode(
+ vp8_writer *const w,
+ int n,
+ vp8_token tok [/* n */],
+ vp8_tree tree,
+ vp8_prob Pnew [/* n-1 */],
+ vp8_prob Pcur [/* n-1 */],
+ unsigned int bct [/* n-1 */] [2],
+ const unsigned int num_events[/* n */]
+)
+{
+ unsigned int new_b = 0, old_b = 0;
+ int i = 0;
+
+ vp8_tree_probs_from_distribution(
+ n--, tok, tree,
+ Pnew, bct, num_events,
+ 256, 1
+ );
+
+ do
+ {
+ new_b += vp8_cost_branch(bct[i], Pnew[i]);
+ old_b += vp8_cost_branch(bct[i], Pcur[i]);
+ }
+ while (++i < n);
+
+ if (new_b + (n << 8) < old_b)
+ {
+ int i = 0;
+
+ vp8_write_bit(w, 1);
+
+ do
+ {
+ const vp8_prob p = Pnew[i];
+
+ vp8_write_literal(w, Pcur[i] = p ? p : 1, 8);
+ }
+ while (++i < n);
+ }
+ else
+ vp8_write_bit(w, 0);
+}
+
+static void update_mbintra_mode_probs(VP8_COMP *cpi)
+{
+ VP8_COMMON *const x = & cpi->common;
+
+ vp8_writer *const w = cpi->bc;
+
+ {
+ vp8_prob Pnew [VP8_YMODES-1];
+ unsigned int bct [VP8_YMODES-1] [2];
+
+ update_mode(
+ w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
+ Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->mb.ymode_count
+ );
+ }
+ {
+ vp8_prob Pnew [VP8_UV_MODES-1];
+ unsigned int bct [VP8_UV_MODES-1] [2];
+
+ update_mode(
+ w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
+ Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->mb.uv_mode_count
+ );
+ }
+}
+
+static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
+}
+
+static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+ vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m);
+}
+
+static void write_split(vp8_writer *bc, int x)
+{
+ vp8_write_token(
+ bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x
+ );
+}
+
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+{
+ const TOKENEXTRA *stop = p + xcount;
+ unsigned int split;
+ unsigned int shift;
+ int count = w->count;
+ unsigned int range = w->range;
+ unsigned int lowvalue = w->lowvalue;
+
+ while (p < stop)
+ {
+ const int t = p->Token;
+ vp8_token *a = vp8_coef_encodings + t;
+ const vp8_extra_bit_struct *b = vp8_extra_bits + t;
+ int i = 0;
+ const unsigned char *pp = p->context_tree;
+ int v = a->value;
+ int n = a->Len;
+
+ if (p->skip_eob_node)
+ {
+ n--;
+ i = 2;
+ }
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = vp8_coef_tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = vp8_norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+
+
+ if (b->base_val)
+ {
+ const int e = p->Extra, L = b->Len;
+
+ if (L)
+ {
+ const unsigned char *pp = b->prob;
+ int v = e >> 1;
+ int n = L; /* number of bits in v, assumed nonzero */
+ int i = 0;
+
+ do
+ {
+ const int bb = (v >> --n) & 1;
+ split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+ i = b->tree[i+bb];
+
+ if (bb)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ shift = vp8_norm[range];
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+ }
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
+ w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+ count -= 8 ;
+ }
+
+ lowvalue <<= shift;
+ }
+ while (n);
+ }
+
+
+ {
+
+ split = (range + 1) >> 1;
+
+ if (e & 1)
+ {
+ lowvalue += split;
+ range = range - split;
+ }
+ else
+ {
+ range = split;
+ }
+
+ range <<= 1;
+
+ if ((lowvalue & 0x80000000))
+ {
+ int x = w->pos - 1;
+
+ while (x >= 0 && w->buffer[x] == 0xff)
+ {
+ w->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ w->buffer[x] += 1;
+
+ }
+
+ lowvalue <<= 1;
+
+ if (!++count)
+ {
+ count = -8;
+
+ validate_buffer(w->buffer + w->pos,
+ 1,
+ w->buffer_end,
+ w->error);
+
+ w->buffer[w->pos++] = (lowvalue >> 24);
+ lowvalue &= 0xffffff;
+ }
+ }
+
+ }
+
+ ++p;
+ }
+
+ w->count = count;
+ w->lowvalue = lowvalue;
+ w->range = range;
+
+}
+
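[Editor's note] The heavily inlined loops above are the VP8 boolean (arithmetic) encoder. Pulled out into a helper, coding one bit looks like the sketch below, which mirrors the listing line for line (validate_buffer omitted): split the range in proportion to the probability, select the sub-range for the bit, renormalise via the vp8_norm table, and flush bytes with backward carry propagation through any 0xff bytes.

    static void encode_bool_sketch(vp8_writer *w, int bit, unsigned char prob)
    {
        unsigned int lowvalue = w->lowvalue;
        unsigned int split = 1 + (((w->range - 1) * prob) >> 8);
        int shift;

        if (bit)
        {
            lowvalue += split;
            w->range -= split;
        }
        else
            w->range = split;

        shift = vp8_norm[w->range];
        w->range <<= shift;
        w->count += shift;

        if (w->count >= 0)
        {
            int offset = shift - w->count;

            if ((lowvalue << (offset - 1)) & 0x80000000)
            {
                int x = w->pos - 1;          /* propagate the carry */

                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;

                w->buffer[x] += 1;
            }

            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
            lowvalue <<= offset;
            shift = w->count;
            lowvalue &= 0xffffff;
            w->count -= 8;
        }

        w->lowvalue = lowvalue << shift;
    }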
+static void write_partition_size(unsigned char *cx_data, int size)
+{
+ signed char csize;
+
+ csize = size & 0xff;
+ *cx_data = csize;
+ csize = (size >> 8) & 0xff;
+ *(cx_data + 1) = csize;
+ csize = (size >> 16) & 0xff;
+ *(cx_data + 2) = csize;
+
+}
+
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+ unsigned char * cx_data_end,
+ int num_part)
+{
+
+ int i;
+ unsigned char *ptr = cx_data;
+ unsigned char *ptr_end = cx_data_end;
+ vp8_writer * w;
+
+ for (i = 0; i < num_part; i++)
+ {
+ int mb_row;
+
+ w = cpi->bc + i + 1;
+
+ vp8_start_encode(w, ptr, ptr_end);
+
+ for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+ {
+ const TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+ int tokens = (int)(stop - p);
+
+ vp8_pack_tokens_c(w, p, tokens);
+ }
+
+ vp8_stop_encode(w);
+ ptr += w->pos;
+ }
+}
+
+
+static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+{
+ int mb_row;
+
+ for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+ {
+ const TOKENEXTRA *p = cpi->tplist[mb_row].start;
+ const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+ int tokens = (int)(stop - p);
+
+ vp8_pack_tokens_c(w, p, tokens);
+ }
+
+}
+
+static void write_mv_ref
+(
+ vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+ assert(NEARESTMV <= m && m <= SPLITMV);
+#endif
+ vp8_write_token(w, vp8_mv_ref_tree, p,
+ vp8_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+static void write_sub_mv_ref
+(
+ vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+#if CONFIG_DEBUG
+ assert(LEFT4X4 <= m && m <= NEW4X4);
+#endif
+ vp8_write_token(w, vp8_sub_mv_ref_tree, p,
+ vp8_sub_mv_ref_encoding_array - LEFT4X4 + m);
+}
+
+static void write_mv
+(
+ vp8_writer *w, const MV *mv, const int_mv *ref, const MV_CONTEXT *mvc
+)
+{
+ MV e;
+ e.row = mv->row - ref->as_mv.row;
+ e.col = mv->col - ref->as_mv.col;
+
+ vp8_encode_motion_vector(w, &e, mvc);
+}
+
+static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
+{
+ /* Encode the MB segment id. */
+ if (x->segmentation_enabled && x->update_mb_segmentation_map)
+ {
+ switch (mi->segment_id)
+ {
+ case 0:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+ break;
+ case 1:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+ break;
+ case 2:
+ vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+ break;
+ case 3:
+ vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+ break;
+
+            /* TRAP: this should not happen */
+ default:
+ vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+ vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+ break;
+ }
+ }
+}
+void vp8_convert_rfct_to_prob(VP8_COMP *const cpi)
+{
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+ /* Calculate the probabilities used to code the ref frame based on usage */
+ if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+ cpi->prob_intra_coded = 1;
+
+ cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ if (!cpi->prob_last_coded)
+ cpi->prob_last_coded = 1;
+
+ cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+ if (!cpi->prob_gf_coded)
+ cpi->prob_gf_coded = 1;
+
+}
+
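[Editor's note] To make the arithmetic in vp8_convert_rfct_to_prob concrete: with usage counts {INTRA: 10, LAST: 60, GOLDEN: 20, ALTREF: 10}, prob_intra_coded = 10 * 255 / 100 = 25, prob_last_coded = 60 * 255 / 90 = 170, and prob_gf_coded = 20 * 255 / 30 = 170 (integer division throughout); the zero-avoidance clamps then leave all three values unchanged.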
+static void pack_inter_mode_mvs(VP8_COMP *const cpi)
+{
+ VP8_COMMON *const pc = & cpi->common;
+ vp8_writer *const w = cpi->bc;
+ const MV_CONTEXT *mvc = pc->fc.mvc;
+
+
+ MODE_INFO *m = pc->mi;
+ const int mis = pc->mode_info_stride;
+ int mb_row = -1;
+
+ int prob_skip_false = 0;
+
+ cpi->mb.partition_info = cpi->mb.pi;
+
+ vp8_convert_rfct_to_prob(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+
+ if (pc->mb_no_coeff_skip)
+ {
+ int total_mbs = pc->mb_rows * pc->mb_cols;
+
+ prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
+
+ if (prob_skip_false <= 1)
+ prob_skip_false = 1;
+
+ if (prob_skip_false > 255)
+ prob_skip_false = 255;
+
+ cpi->prob_skip_false = prob_skip_false;
+ vp8_write_literal(w, prob_skip_false, 8);
+ }
+
+ vp8_write_literal(w, cpi->prob_intra_coded, 8);
+ vp8_write_literal(w, cpi->prob_last_coded, 8);
+ vp8_write_literal(w, cpi->prob_gf_coded, 8);
+
+ update_mbintra_mode_probs(cpi);
+
+ vp8_write_mvprobs(cpi);
+
+ while (++mb_row < pc->mb_rows)
+ {
+ int mb_col = -1;
+
+ while (++mb_col < pc->mb_cols)
+ {
+ const MB_MODE_INFO *const mi = & m->mbmi;
+ const MV_REFERENCE_FRAME rf = mi->ref_frame;
+ const MB_PREDICTION_MODE mode = mi->mode;
+
+ MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+            /* Distance of MB to the various image edges.
+             * These are specified to 1/8th pel as they are always compared
+             * to MV values that are in 1/8th pel units
+             */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+ xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+#ifdef ENTROPY_STATS
+ active_section = 9;
+#endif
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map)
+ write_mb_features(w, mi, &cpi->mb.e_mbd);
+
+ if (pc->mb_no_coeff_skip)
+ vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+ if (rf == INTRA_FRAME)
+ {
+ vp8_write(w, 0, cpi->prob_intra_coded);
+#ifdef ENTROPY_STATS
+ active_section = 6;
+#endif
+ write_ymode(w, mode, pc->fc.ymode_prob);
+
+ if (mode == B_PRED)
+ {
+ int j = 0;
+
+ do
+ write_bmode(w, m->bmi[j].as_mode, pc->fc.bmode_prob);
+ while (++j < 16);
+ }
+
+ write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
+ }
+ else /* inter coded */
+ {
+ int_mv best_mv;
+ vp8_prob mv_ref_p [VP8_MVREFS-1];
+
+ vp8_write(w, 1, cpi->prob_intra_coded);
+
+ if (rf == LAST_FRAME)
+ vp8_write(w, 0, cpi->prob_last_coded);
+ else
+ {
+ vp8_write(w, 1, cpi->prob_last_coded);
+ vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, cpi->prob_gf_coded);
+ }
+
+ {
+ int_mv n1, n2;
+ int ct[4];
+
+ vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+ vp8_clamp_mv2(&best_mv, xd);
+
+ vp8_mv_ref_probs(mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+ accum_mv_refs(mode, ct);
+#endif
+
+ }
+
+#ifdef ENTROPY_STATS
+ active_section = 3;
+#endif
+
+ write_mv_ref(w, mode, mv_ref_p);
+
+ switch (mode) /* new, split require MVs */
+ {
+ case NEWMV:
+
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+
+ write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
+ break;
+
+ case SPLITMV:
+ {
+ int j = 0;
+
+#ifdef MODE_STATS
+ ++count_mb_seg [mi->partitioning];
+#endif
+
+ write_split(w, mi->partitioning);
+
+ do
+ {
+ B_PREDICTION_MODE blockmode;
+ int_mv blockmv;
+ const int *const L = vp8_mbsplits [mi->partitioning];
+ int k = -1; /* first block in subset j */
+ int mv_contz;
+ int_mv leftmv, abovemv;
+
+ blockmode = cpi->mb.partition_info->bmi[j].mode;
+ blockmv = cpi->mb.partition_info->bmi[j].mv;
+#if CONFIG_DEBUG
+ while (j != L[++k])
+ if (k >= 16)
+ assert(0);
+#else
+ while (j != L[++k]);
+#endif
+ leftmv.as_int = left_block_mv(m, k);
+ abovemv.as_int = above_block_mv(m, k, mis);
+ mv_contz = vp8_mv_cont(&leftmv, &abovemv);
+
+ write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2 [mv_contz]);
+
+ if (blockmode == NEW4X4)
+ {
+#ifdef ENTROPY_STATS
+ active_section = 11;
+#endif
+ write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);
+ }
+ }
+ while (++j < cpi->mb.partition_info->count);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ ++m;
+ cpi->mb.partition_info++;
+ }
+
+ ++m; /* skip L prediction border */
+ cpi->mb.partition_info++;
+ }
+}
+
+
+static void write_kfmodes(VP8_COMP *cpi)
+{
+ vp8_writer *const bc = cpi->bc;
+ const VP8_COMMON *const c = & cpi->common;
+ /* const */
+ MODE_INFO *m = c->mi;
+
+ int mb_row = -1;
+ int prob_skip_false = 0;
+
+ if (c->mb_no_coeff_skip)
+ {
+ int total_mbs = c->mb_rows * c->mb_cols;
+
+ prob_skip_false = (total_mbs - cpi->mb.skip_true_count ) * 256 / total_mbs;
+
+ if (prob_skip_false <= 1)
+ prob_skip_false = 1;
+
+ if (prob_skip_false >= 255)
+ prob_skip_false = 255;
+
+ cpi->prob_skip_false = prob_skip_false;
+ vp8_write_literal(bc, prob_skip_false, 8);
+ }
+
+ while (++mb_row < c->mb_rows)
+ {
+ int mb_col = -1;
+
+ while (++mb_col < c->mb_cols)
+ {
+ const int ym = m->mbmi.mode;
+
+ if (cpi->mb.e_mbd.update_mb_segmentation_map)
+ write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+
+ if (c->mb_no_coeff_skip)
+ vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+ kfwrite_ymode(bc, ym, vp8_kf_ymode_prob);
+
+ if (ym == B_PRED)
+ {
+ const int mis = c->mode_info_stride;
+ int i = 0;
+
+ do
+ {
+ const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(m, i);
+ const int bm = m->bmi[i].as_mode;
+
+#ifdef ENTROPY_STATS
+ ++intra_mode_stats [A] [L] [bm];
+#endif
+
+ write_bmode(bc, bm, vp8_kf_bmode_prob [A] [L]);
+ }
+ while (++i < 16);
+ }
+
+ write_uv_mode(bc, (m++)->mbmi.uv_mode, vp8_kf_uv_mode_prob);
+ }
+
+ m++; /* skip L prediction border */
+ }
+}
+
+#if 0
+/* This function is used for debugging probability trees. */
+static void print_prob_tree(vp8_prob
+ coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES])
+{
+ /* print coef probability tree */
+ int i,j,k,l;
+ FILE* f = fopen("enc_tree_probs.txt", "a");
+ fprintf(f, "{\n");
+ for (i = 0; i < BLOCK_TYPES; i++)
+ {
+ fprintf(f, " {\n");
+ for (j = 0; j < COEF_BANDS; j++)
+ {
+ fprintf(f, " {\n");
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ {
+ fprintf(f, " {");
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+ fprintf(f, "%3u, ",
+ (unsigned int)(coef_probs [i][j][k][l]));
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, " }\n");
+ }
+ fprintf(f, "}\n");
+ fclose(f);
+}
+#endif
+
+static void sum_probs_over_prev_coef_context(
+ const unsigned int probs[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+ unsigned int* out)
+{
+ int i, j;
+ for (i=0; i < MAX_ENTROPY_TOKENS; ++i)
+ {
+ for (j=0; j < PREV_COEF_CONTEXTS; ++j)
+ {
+ const unsigned int tmp = out[i];
+ out[i] += probs[j][i];
+ /* check for wrap */
+ if (out[i] < tmp)
+ out[i] = UINT_MAX;
+ }
+ }
+}
+
+static int prob_update_savings(const unsigned int *ct,
+ const vp8_prob oldp, const vp8_prob newp,
+ const vp8_prob upd)
+{
+ const int old_b = vp8_cost_branch(ct, oldp);
+ const int new_b = vp8_cost_branch(ct, newp);
+ const int update_b = 8 +
+ ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+ return old_b - new_b - update_b;
+}
+
+static int independent_coef_context_savings(VP8_COMP *cpi)
+{
+ MACROBLOCK *const x = & cpi->mb;
+ int savings = 0;
+ int i = 0;
+ do
+ {
+ int j = 0;
+ do
+ {
+ int k = 0;
+ unsigned int prev_coef_count_sum[MAX_ENTROPY_TOKENS] = {0};
+ int prev_coef_savings[MAX_ENTROPY_TOKENS] = {0};
+ const unsigned int (*probs)[MAX_ENTROPY_TOKENS];
+ /* Calculate new probabilities given the constraint that
+ * they must be equal over the prev coef contexts
+ */
+
+ probs = (const unsigned int (*)[MAX_ENTROPY_TOKENS])
+ x->coef_counts[i][j];
+
+ /* Reset to default probabilities at key frames */
+ if (cpi->common.frame_type == KEY_FRAME)
+ probs = default_coef_counts[i][j];
+
+ sum_probs_over_prev_coef_context(probs, prev_coef_count_sum);
+
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ vp8_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+ cpi->frame_coef_probs[i][j][k],
+ cpi->frame_branch_ct [i][j][k],
+ prev_coef_count_sum,
+ 256, 1);
+
+ do
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t];
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+ const vp8_prob oldp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+ const int s = prob_update_savings(ct, oldp, newp, upd);
+
+ if (cpi->common.frame_type != KEY_FRAME ||
+ (cpi->common.frame_type == KEY_FRAME && newp != oldp))
+ prev_coef_savings[t] += s;
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ k = 0;
+ do
+ {
+ /* We only update probabilities if we can save bits, except
+ * for key frames where we have to update all probabilities
+                 * to get equal probabilities across the prev coef
+ * contexts.
+ */
+ if (prev_coef_savings[k] > 0 ||
+ cpi->common.frame_type == KEY_FRAME)
+ savings += prev_coef_savings[k];
+ }
+ while (++k < ENTROPY_NODES);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+ return savings;
+}
+
+static int default_coef_context_savings(VP8_COMP *cpi)
+{
+ MACROBLOCK *const x = & cpi->mb;
+ int savings = 0;
+ int i = 0;
+ do
+ {
+ int j = 0;
+ do
+ {
+ int k = 0;
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ vp8_tree_probs_from_distribution(
+ MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+ cpi->frame_coef_probs [i][j][k],
+ cpi->frame_branch_ct [i][j][k],
+ x->coef_counts [i][j][k],
+ 256, 1
+ );
+
+ do
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j][k][t];
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+ const vp8_prob oldp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+ const int s = prob_update_savings(ct, oldp, newp, upd);
+
+ if (s > 0)
+ {
+ savings += s;
+ }
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+ return savings;
+}
+
+void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+ int prob_intra,
+ int prob_last,
+ int prob_garf
+ )
+{
+ ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(prob_intra);
+ ref_frame_cost[LAST_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_zero(prob_last);
+ ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_one(prob_last)
+ + vp8_cost_zero(prob_garf);
+ ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(prob_intra)
+ + vp8_cost_one(prob_last)
+ + vp8_cost_one(prob_garf);
+
+}
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi)
+{
+ int savings = 0;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+ int new_intra, new_last, new_garf, oldtotal, newtotal;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ vp8_clear_system_state();
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter)))
+ new_intra = 1;
+
+ new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+ new_garf = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+ ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+
+ vp8_calc_ref_frame_costs(ref_frame_cost,new_intra,new_last,new_garf);
+
+ newtotal =
+ rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+ rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+ rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+
+ /* old costs */
+ vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded,
+ cpi->prob_last_coded,cpi->prob_gf_coded);
+
+ oldtotal =
+ rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+ rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+ rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+ rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+ savings += (oldtotal - newtotal) / 256;
+ }
+
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ savings += independent_coef_context_savings(cpi);
+ else
+ savings += default_coef_context_savings(cpi);
+
+
+ return savings;
+}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+int vp8_update_coef_context(VP8_COMP *cpi)
+{
+ int savings = 0;
+
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ /* Reset to default counts/probabilities at key frames */
+ vp8_copy(cpi->coef_counts, default_coef_counts);
+ }
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ savings += independent_coef_context_savings(cpi);
+ else
+ savings += default_coef_context_savings(cpi);
+
+ return savings;
+}
+#endif
+
+void vp8_update_coef_probs(VP8_COMP *cpi)
+{
+ int i = 0;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_writer *const w = cpi->bc;
+#endif
+ int savings = 0;
+
+ vp8_clear_system_state();
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+ int prev_coef_savings[ENTROPY_NODES] = {0};
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ for (k = 0; k < PREV_COEF_CONTEXTS; ++k)
+ {
+ int t; /* token/prob index */
+ for (t = 0; t < ENTROPY_NODES; ++t)
+ {
+ const unsigned int *ct = cpi->frame_branch_ct [i][j]
+ [k][t];
+ const vp8_prob newp = cpi->frame_coef_probs[i][j][k][t];
+ const vp8_prob oldp = cpi->common.fc.coef_probs[i][j]
+ [k][t];
+ const vp8_prob upd = vp8_coef_update_probs[i][j][k][t];
+
+ prev_coef_savings[t] +=
+ prob_update_savings(ct, oldp, newp, upd);
+ }
+ }
+ k = 0;
+ }
+ do
+ {
+ /* note: use result from vp8_estimate_entropy_savings, so no
+ * need to call vp8_tree_probs_from_distribution here.
+ */
+
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+ vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ int s = prev_coef_savings[t];
+ int u = 0;
+
+ if (!(cpi->oxcf.error_resilient_mode &
+ VPX_ERROR_RESILIENT_PARTITIONS))
+ {
+ s = prob_update_savings(
+ cpi->frame_branch_ct [i][j][k][t],
+ *Pold, newp, upd);
+ }
+
+ if (s > 0)
+ u = 1;
+
+                    /* Force updates on key frames if the new probability
+                     * differs, so that we can be sure we end up with equal
+                     * probabilities over the prev coef contexts.
+                     */
+ if ((cpi->oxcf.error_resilient_mode &
+ VPX_ERROR_RESILIENT_PARTITIONS) &&
+ cpi->common.frame_type == KEY_FRAME && newp != *Pold)
+ u = 1;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ cpi->update_probs[i][j][k][t] = u;
+#else
+ vp8_write(w, u, upd);
+#endif
+
+
+#ifdef ENTROPY_STATS
+ ++ tree_update_hist [i][j][k][t] [u];
+#endif
+
+ if (u)
+ {
+ /* send/use new probability */
+
+ *Pold = newp;
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_write_literal(w, newp, 8);
+#endif
+
+ savings += s;
+
+ }
+
+ }
+ while (++t < ENTROPY_NODES);
+
+ /* Accum token counts for generation of default statistics */
+#ifdef ENTROPY_STATS
+ t = 0;
+
+ do
+ {
+ context_counters [i][j][k][t] += cpi->coef_counts [i][j][k][t];
+ }
+ while (++t < MAX_ENTROPY_TOKENS);
+
+#endif
+
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+
+}
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+static void pack_coef_probs(VP8_COMP *cpi)
+{
+ int i = 0;
+ vp8_writer *const w = cpi->bc;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ int k = 0;
+
+ do
+ {
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ const vp8_prob newp = cpi->common.fc.coef_probs [i][j][k][t];
+ const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+ const char u = cpi->update_probs[i][j][k][t] ;
+
+ vp8_write(w, u, upd);
+
+ if (u)
+ {
+ /* send/use new probability */
+ vp8_write_literal(w, newp, 8);
+ }
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+}
+#endif
+
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
+static void put_delta_q(vp8_writer *bc, int delta_q)
+{
+ if (delta_q != 0)
+ {
+ vp8_write_bit(bc, 1);
+ vp8_write_literal(bc, abs(delta_q), 4);
+
+ if (delta_q < 0)
+ vp8_write_bit(bc, 1);
+ else
+ vp8_write_bit(bc, 0);
+ }
+ else
+ vp8_write_bit(bc, 0);
+}
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest_end, unsigned long *size)
+{
+ int i, j;
+ VP8_HEADER oh;
+ VP8_COMMON *const pc = & cpi->common;
+ vp8_writer *const bc = cpi->bc;
+ MACROBLOCKD *const xd = & cpi->mb.e_mbd;
+ int extra_bytes_packed = 0;
+
+ unsigned char *cx_data = dest;
+ unsigned char *cx_data_end = dest_end;
+ const int *mb_feature_data_bits;
+
+ oh.show_frame = (int) pc->show_frame;
+ oh.type = (int)pc->frame_type;
+ oh.version = pc->version;
+ oh.first_partition_length_in_bytes = 0;
+
+ mb_feature_data_bits = vp8_mb_feature_data_bits;
+
+ bc[0].error = &pc->error;
+
+ validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error);
+ cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+ Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256;
+#endif
+
+    /* every key frame sends the start code, width, height, scale factor,
+     * clamp and color type
+     */
+ if (oh.type == KEY_FRAME)
+ {
+ int v;
+
+ validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error);
+
+ /* Start / synch code */
+ cx_data[0] = 0x9D;
+ cx_data[1] = 0x01;
+ cx_data[2] = 0x2a;
+
+ v = (pc->horiz_scale << 14) | pc->Width;
+ cx_data[3] = v;
+ cx_data[4] = v >> 8;
+
+ v = (pc->vert_scale << 14) | pc->Height;
+ cx_data[5] = v;
+ cx_data[6] = v >> 8;
+
+
+ extra_bytes_packed = 7;
+ cx_data += extra_bytes_packed ;
+
+ vp8_start_encode(bc, cx_data, cx_data_end);
+
+ /* signal clr type */
+ vp8_write_bit(bc, pc->clr_type);
+ vp8_write_bit(bc, pc->clamp_type);
+
+ }
+ else
+ vp8_start_encode(bc, cx_data, cx_data_end);
+
+
+ /* Signal whether or not Segmentation is enabled */
+ vp8_write_bit(bc, xd->segmentation_enabled);
+
+ /* Indicate which features are enabled */
+ if (xd->segmentation_enabled)
+ {
+ /* Signal whether or not the segmentation map is being updated. */
+ vp8_write_bit(bc, xd->update_mb_segmentation_map);
+ vp8_write_bit(bc, xd->update_mb_segmentation_data);
+
+ if (xd->update_mb_segmentation_data)
+ {
+ signed char Data;
+
+ vp8_write_bit(bc, xd->mb_segement_abs_delta);
+
+ /* For each segmentation feature (Quant and loop filter level) */
+ for (i = 0; i < MB_LVL_MAX; i++)
+ {
+ /* For each of the segments */
+ for (j = 0; j < MAX_MB_SEGMENTS; j++)
+ {
+ Data = xd->segment_feature_data[i][j];
+
+ /* Frame level data */
+ if (Data)
+ {
+ vp8_write_bit(bc, 1);
+
+ if (Data < 0)
+ {
+ Data = - Data;
+ vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+ vp8_write_bit(bc, 1);
+ }
+ else
+ {
+ vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+ vp8_write_bit(bc, 0);
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+ if (xd->update_mb_segmentation_map)
+ {
+ /* Write the probs used to decode the segment id for each mb */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+ {
+ int Data = xd->mb_segment_tree_probs[i];
+
+ if (Data != 255)
+ {
+ vp8_write_bit(bc, 1);
+ vp8_write_literal(bc, Data, 8);
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+ vp8_write_bit(bc, pc->filter_type);
+ vp8_write_literal(bc, pc->filter_level, 6);
+ vp8_write_literal(bc, pc->sharpness_level, 3);
+
+ /* Write out loop filter deltas applied at the MB level based on mode
+ * or ref frame (if they are enabled).
+ */
+ vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);
+
+ if (xd->mode_ref_lf_delta_enabled)
+ {
+ /* Do the deltas need to be updated */
+ int send_update = xd->mode_ref_lf_delta_update
+ || cpi->oxcf.error_resilient_mode;
+
+ vp8_write_bit(bc, send_update);
+ if (send_update)
+ {
+ int Data;
+
+ /* Send update */
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+ {
+ Data = xd->ref_lf_deltas[i];
+
+ /* Frame level data */
+ if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]
+ || cpi->oxcf.error_resilient_mode)
+ {
+ xd->last_ref_lf_deltas[i] = xd->ref_lf_deltas[i];
+ vp8_write_bit(bc, 1);
+
+ if (Data > 0)
+ {
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 0); /* sign */
+ }
+ else
+ {
+ Data = -Data;
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 1); /* sign */
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+
+ /* Send update */
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ {
+ Data = xd->mode_lf_deltas[i];
+
+ if (xd->mode_lf_deltas[i] != xd->last_mode_lf_deltas[i]
+ || cpi->oxcf.error_resilient_mode)
+ {
+ xd->last_mode_lf_deltas[i] = xd->mode_lf_deltas[i];
+ vp8_write_bit(bc, 1);
+
+ if (Data > 0)
+ {
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 0); /* sign */
+ }
+ else
+ {
+ Data = -Data;
+ vp8_write_literal(bc, (Data & 0x3F), 6);
+ vp8_write_bit(bc, 1); /* sign */
+ }
+ }
+ else
+ vp8_write_bit(bc, 0);
+ }
+ }
+ }
+
+    /* signal here whether the multi-token partition is enabled */
+ vp8_write_literal(bc, pc->multi_token_partition, 2);
+
+ /* Frame Qbaseline quantizer index */
+ vp8_write_literal(bc, pc->base_qindex, 7);
+
+    /* Transmit DC, second order and UV quantizer delta information */
+ put_delta_q(bc, pc->y1dc_delta_q);
+ put_delta_q(bc, pc->y2dc_delta_q);
+ put_delta_q(bc, pc->y2ac_delta_q);
+ put_delta_q(bc, pc->uvdc_delta_q);
+ put_delta_q(bc, pc->uvac_delta_q);
+
+    /* When there is a key frame, all reference buffers are updated using
+ * the new key frame
+ */
+ if (pc->frame_type != KEY_FRAME)
+ {
+ /* Should the GF or ARF be updated using the transmitted frame
+         * or a buffer?
+ */
+ vp8_write_bit(bc, pc->refresh_golden_frame);
+ vp8_write_bit(bc, pc->refresh_alt_ref_frame);
+
+        /* If not being updated from the current frame, should either the GF
+         * or ARF be updated from another buffer?
+         */
+ if (!pc->refresh_golden_frame)
+ vp8_write_literal(bc, pc->copy_buffer_to_gf, 2);
+
+ if (!pc->refresh_alt_ref_frame)
+ vp8_write_literal(bc, pc->copy_buffer_to_arf, 2);
+
+ /* Indicate reference frame sign bias for Golden and ARF frames
+ * (always 0 for last frame buffer)
+ */
+ vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+ vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+ }
+
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ if (pc->frame_type == KEY_FRAME)
+ pc->refresh_entropy_probs = 1;
+ else
+ pc->refresh_entropy_probs = 0;
+ }
+#endif
+
+ vp8_write_bit(bc, pc->refresh_entropy_probs);
+
+ if (pc->frame_type != KEY_FRAME)
+ vp8_write_bit(bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+
+ if (pc->frame_type == INTER_FRAME)
+ active_section = 0;
+ else
+ active_section = 7;
+
+#endif
+
+ vp8_clear_system_state();
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ pack_coef_probs(cpi);
+#else
+ if (pc->refresh_entropy_probs == 0)
+ {
+ /* save a copy for later refresh */
+ vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+ }
+
+ vp8_update_coef_probs(cpi);
+#endif
+
+#ifdef ENTROPY_STATS
+ active_section = 2;
+#endif
+
+ /* Write out the mb_no_coeff_skip flag */
+ vp8_write_bit(bc, pc->mb_no_coeff_skip);
+
+ if (pc->frame_type == KEY_FRAME)
+ {
+ write_kfmodes(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 8;
+#endif
+ }
+ else
+ {
+ pack_inter_mode_mvs(cpi);
+
+#ifdef ENTROPY_STATS
+ active_section = 1;
+#endif
+ }
+
+ vp8_stop_encode(bc);
+
+ cx_data += bc->pos;
+
+ oh.first_partition_length_in_bytes = cpi->bc->pos;
+
+ /* update frame tag */
+ {
+ int v = (oh.first_partition_length_in_bytes << 5) |
+ (oh.show_frame << 4) |
+ (oh.version << 1) |
+ oh.type;
+
+ dest[0] = v;
+ dest[1] = v >> 8;
+ dest[2] = v >> 16;
+ }
+
+ *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
+
+ cpi->partition_sz[0] = *size;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ const int num_part = (1 << pc->multi_token_partition);
+ unsigned char * dp = cpi->partition_d[0] + cpi->partition_sz[0];
+
+ if (num_part > 1)
+ {
+ /* write token part sizes (all but last) if more than 1 */
+ validate_buffer(dp, 3 * (num_part - 1), cpi->partition_d_end[0],
+ &pc->error);
+
+ cpi->partition_sz[0] += 3*(num_part-1);
+
+ for(i = 1; i < num_part; i++)
+ {
+ write_partition_size(dp, cpi->partition_sz[i]);
+ dp += 3;
+ }
+ }
+
+ if (!cpi->output_partition)
+ {
+ /* concatenate partition buffers */
+ for(i = 0; i < num_part; i++)
+ {
+ vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+ cpi->partition_d[i+1] = dp;
+ dp += cpi->partition_sz[i+1];
+ }
+ }
+
+ /* update total size */
+ *size = 0;
+ for(i = 0; i < num_part+1; i++)
+ {
+ *size += cpi->partition_sz[i];
+ }
+ }
+#else
+ if (pc->multi_token_partition != ONE_PARTITION)
+ {
+ int num_part = 1 << pc->multi_token_partition;
+
+ /* partition size table at the end of first partition */
+ cpi->partition_sz[0] += 3 * (num_part - 1);
+ *size += 3 * (num_part - 1);
+
+ validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end,
+ &pc->error);
+
+ for(i = 1; i < num_part + 1; i++)
+ {
+ cpi->bc[i].error = &pc->error;
+ }
+
+ pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1),
+ cx_data_end, num_part);
+
+        for (i = 1; i < num_part; i++)
+ {
+ cpi->partition_sz[i] = cpi->bc[i].pos;
+ write_partition_size(cx_data, cpi->partition_sz[i]);
+ cx_data += 3;
+ *size += cpi->partition_sz[i]; /* add to total */
+ }
+
+ /* add last partition to total size */
+ cpi->partition_sz[i] = cpi->bc[i].pos;
+ *size += cpi->partition_sz[i];
+ }
+ else
+ {
+        cpi->bc[1].error = &pc->error;
+
+ vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ pack_mb_row_tokens(cpi, &cpi->bc[1]);
+ else
+#endif
+ pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
+
+ vp8_stop_encode(&cpi->bc[1]);
+
+ *size += cpi->bc[1].pos;
+ cpi->partition_sz[1] = cpi->bc[1].pos;
+ }
+#endif
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs(void)
+{
+    int i, j, k, l;
+    FILE *f = fopen("context.c", "a");
+    int Sum;
+
+    if (!f)
+        return;
+
+ fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+ fprintf(f, "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n");
+
+ for (i = 0; i < BLOCK_TYPES; i++)
+ {
+ fprintf(f, " { \n");
+
+ for (j = 0; j < COEF_BANDS; j++)
+ {
+ fprintf(f, " {\n");
+
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+ {
+ fprintf(f, " {");
+
+ for (l = 0; l < ENTROPY_NODES; l++)
+ {
+ Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];
+
+ if (Sum > 0)
+ {
+                        if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)
+                            fprintf(f, "%3d, ", (int)((tree_update_hist[i][j][k][l][0] * 255) / Sum));
+                        else
+                            fprintf(f, "%3d, ", 1);
+                    }
+                    else
+                        fprintf(f, "%3d, ", 128);
+ }
+
+ fprintf(f, "},\n");
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+#endif
diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h
new file mode 100644
index 0000000..455a94f
--- /dev/null
+++ b/libvpx/vp8/encoder/bitstream.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
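+/* pack_tokens(), pack_tokens_into_partitions() and pack_mb_row_tokens()
+ * resolve to the ARMv5TE (EDSP) assembly implementations when HAVE_EDSP
+ * is set, and to the C versions from bitstream.c otherwise.
+ */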
+#if HAVE_EDSP
+void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
+ const vp8_token *,
+ const vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
+ unsigned char * cx_data,
+ const unsigned char *cx_data_end,
+ int num_parts,
+ const vp8_token *,
+ const vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
+ const vp8_token *,
+ const vp8_extra_bit_struct *,
+ const vp8_tree_index *);
+# define pack_tokens(a,b,c) \
+ vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_tokens_into_partitions(a,b,c,d) \
+ vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_mb_row_tokens(a,b) \
+ vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+#else
+
+void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+
+# define pack_tokens(a,b,c) vp8_pack_tokens_c(a,b,c)
+# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
+# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
+#endif
+
+#endif
diff --git a/libvpx/vp8/encoder/block.h b/libvpx/vp8/encoder/block.h
new file mode 100644
index 0000000..f9d63eb
--- /dev/null
+++ b/libvpx/vp8/encoder/block.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "vp8/common/onyx.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropy.h"
+#include "vpx_ports/mem.h"
+
+/* motion search site */
+typedef struct
+{
+ MV mv;
+ int offset;
+} search_site;
+
+typedef struct block
+{
+ /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+ short *src_diff;
+ short *coeff;
+
+ /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+ short *quant;
+ short *quant_fast;
+ unsigned char *quant_shift;
+ short *zbin;
+ short *zrun_zbin_boost;
+ short *round;
+
+ /* Zbin Over Quant value */
+ short zbin_extra;
+
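+    /* Source pixels for this block are addressed indirectly as
+     * (*base_src) + src, with src_stride bytes between rows, so the
+     * underlying frame pointer can be swapped without updating every
+     * BLOCK.
+     */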
+ unsigned char **base_src;
+ int src;
+ int src_stride;
+} BLOCK;
+
+typedef struct
+{
+ int count;
+ struct
+ {
+ B_PREDICTION_MODE mode;
+ int_mv mv;
+ } bmi[16];
+} PARTITION_INFO;
+
+typedef struct macroblock
+{
+ DECLARE_ALIGNED(16, short, src_diff[400]); /* 25 blocks Y,U,V,Y2 */
+ DECLARE_ALIGNED(16, short, coeff[400]); /* 25 blocks Y,U,V,Y2 */
+ DECLARE_ALIGNED(16, unsigned char, thismb[256]);
+
+ unsigned char *thismb_ptr;
+ /* 16 Y, 4 U, 4 V, 1 DC 2nd order block */
+ BLOCK block[25];
+
+ YV12_BUFFER_CONFIG src;
+
+ MACROBLOCKD e_mbd;
+ PARTITION_INFO *partition_info; /* work pointer */
+ PARTITION_INFO *pi; /* Corresponds to upper left visible macroblock */
+ PARTITION_INFO *pip; /* Base of allocated array */
+
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ search_site *ss;
+ int ss_count;
+ int searches_per_step;
+
+ int errorperbit;
+ int sadperbit16;
+ int sadperbit4;
+ int rddiv;
+ int rdmult;
+ unsigned int * mb_activity_ptr;
+ int * mb_norm_activity_ptr;
+ signed int act_zbin_adj;
+ signed int last_act_zbin_adj;
+
+ int *mvcost[2];
+ int *mvsadcost[2];
+ int (*mbmode_cost)[MB_MODE_COUNT];
+ int (*intra_uv_mode_cost)[MB_MODE_COUNT];
+ int (*bmode_costs)[10][10];
+ int *inter_bmode_costs;
+ int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS];
+
+ /* These define limits to motion vector components to prevent
+ * them from extending outside the UMV borders.
+ */
+ int mv_col_min;
+ int mv_col_max;
+ int mv_row_min;
+ int mv_row_max;
+
+ int skip;
+
+ unsigned int encode_breakout;
+
+ signed char *gf_active_ptr;
+
+ unsigned char *active_ptr;
+ MV_CONTEXT *mvc;
+
+ int optimize;
+ int q_index;
+
+#if CONFIG_TEMPORAL_DENOISING
+ MB_PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ MV_REFERENCE_FRAME best_reference_frame;
+ MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ unsigned char need_to_clamp_best_mvs;
+#endif
+
+ int skip_true_count;
+ unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+ unsigned int MVcount [2] [MVvals]; /* (row,col) MV cts this frame */
+ int ymode_count [VP8_YMODES]; /* intra MB type cts this frame */
+ int uv_mode_count[VP8_UV_MODES]; /* intra MB type cts this frame */
+ int64_t prediction_error;
+ int64_t intra_error;
+
+ void (*short_fdct4x4)(short *input, short *output, int pitch);
+ void (*short_fdct8x4)(short *input, short *output, int pitch);
+ void (*short_walsh4x4)(short *input, short *output, int pitch);
+ void (*quantize_b)(BLOCK *b, BLOCKD *d);
+ void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+
+} MACROBLOCK;
+
+
+#endif
diff --git a/libvpx/vp8/encoder/boolhuff.c b/libvpx/vp8/encoder/boolhuff.c
new file mode 100644
index 0000000..74770a2
--- /dev/null
+++ b/libvpx/vp8/encoder/boolhuff.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
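+/* vp8_prob_cost[p] approximates -256 * log2(p / 256): the cost, in
+ * 1/256th of a bit, of coding a zero with probability p. E.g. p = 128
+ * (even odds) costs 255, about one bit; p = 1 costs 2047, about eight.
+ */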
+const unsigned int vp8_prob_cost[256] =
+{
+ 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+ 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
+ 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
+ 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
+ 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
+ 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
+ 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
+ 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
+ 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
+ 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
+ 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
+ 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
+ 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
+ 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
+ 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
+ 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
+};
+
+void vp8_start_encode(BOOL_CODER *br, unsigned char *source, unsigned char *source_end)
+{
+    br->lowvalue = 0;
+ br->range = 255;
+ br->count = -24;
+ br->buffer = source;
+ br->buffer_end = source_end;
+ br->pos = 0;
+}
+
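+/* Flushing 32 zero bits at probability 128 forces the remaining
+ * contents of lowvalue through the output path, so the final bytes of
+ * the partition reach the buffer.
+ */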
+void vp8_stop_encode(BOOL_CODER *br)
+{
+ int i;
+
+ for (i = 0; i < 32; i++)
+ vp8_encode_bool(br, 0, 128);
+}
+
+
+void vp8_encode_value(BOOL_CODER *br, int data, int bits)
+{
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--)
+ vp8_encode_bool(br, (1 & (data >> bit)), 0x80);
+}
diff --git a/libvpx/vp8/encoder/boolhuff.h b/libvpx/vp8/encoder/boolhuff.h
new file mode 100644
index 0000000..8309063
--- /dev/null
+++ b/libvpx/vp8/encoder/boolhuff.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+* Module Title : boolhuff.h
+*
+* Description : Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+
+typedef struct
+{
+ unsigned int lowvalue;
+ unsigned int range;
+ int count;
+ unsigned int pos;
+ unsigned char *buffer;
+ unsigned char *buffer_end;
+ struct vpx_internal_error_info *error;
+
+    /* Variables used to track bit costs without outputting to the bitstream */
+ unsigned int measure_cost;
+ unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);
+
+extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp8_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp8_prob_cost[256];
+
+
+DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
+
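+/* Returns 1 if [start, start + len) lies within the buffer. Otherwise
+ * raises VPX_CODEC_CORRUPT_FRAME; vpx_internal_error() longjmps back to
+ * the codec entry point when a jump target is armed, so the return 0 is
+ * only reached without one.
+ */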
+static int validate_buffer(const unsigned char *start,
+ size_t len,
+ const unsigned char *end,
+ struct vpx_internal_error_info *error)
+{
+ if (start + len > start && start + len < end)
+ return 1;
+ else
+ vpx_internal_error(error, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt partition ");
+
+ return 0;
+}
+
+static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
+{
+ unsigned int split;
+ int count = br->count;
+ unsigned int range = br->range;
+ unsigned int lowvalue = br->lowvalue;
+ register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (bit)
+ Sectionbits[active_section] += vp8_prob_cost[255-probability];
+ else
+ Sectionbits[active_section] += vp8_prob_cost[probability];
+
+#endif
+#endif
+
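+    /* Divide the current range in proportion to the probability of a
+     * zero: e.g. range = 255, probability = 128 gives split = 128, an
+     * even partition of the interval.
+     */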
+ split = 1 + (((range - 1) * probability) >> 8);
+
+ range = split;
+
+ if (bit)
+ {
+ lowvalue += split;
+ range = br->range - split;
+ }
+
+ shift = vp8_norm[range];
+
+ range <<= shift;
+ count += shift;
+
+ if (count >= 0)
+ {
+ int offset = shift - count;
+
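+        /* Propagate a carry out of lowvalue: walk back over any 0xff
+         * bytes already written, wrapping them to 0x00, and increment
+         * the first earlier byte.
+         */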
+ if ((lowvalue << (offset - 1)) & 0x80000000)
+ {
+ int x = br->pos - 1;
+
+ while (x >= 0 && br->buffer[x] == 0xff)
+ {
+ br->buffer[x] = (unsigned char)0;
+ x--;
+ }
+
+ br->buffer[x] += 1;
+ }
+
+ validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error);
+ br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+
+ lowvalue <<= offset;
+ shift = count;
+ lowvalue &= 0xffffff;
+        count -= 8;
+ }
+
+ lowvalue <<= shift;
+ br->count = count;
+ br->lowvalue = lowvalue;
+ br->range = range;
+}
+
+#endif
diff --git a/libvpx/vp8/encoder/dct.c b/libvpx/vp8/encoder/dct.c
new file mode 100644
index 0000000..b5a11ae
--- /dev/null
+++ b/libvpx/vp8/encoder/dct.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+
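+/* The constants 5352 and 2217 correspond to sqrt(2)*cos(pi/8) and
+ * sqrt(2)*sin(pi/8) in Q12 fixed point (1.30656 * 4096 ~ 5352,
+ * 0.54120 * 4096 ~ 2217), the rotation stage of the 4x4 forward DCT.
+ */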
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ((ip[0] + ip[3])<<3);
+ b1 = ((ip[1] + ip[2])<<3);
+ c1 = ((ip[1] - ip[2])<<3);
+ d1 = ((ip[0] - ip[3])<<3);
+
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
+
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12;
+
+ ip += pitch / 2;
+ op += 4;
+
+ }
+ ip = output;
+ op = output;
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[12];
+ b1 = ip[4] + ip[8];
+ c1 = ip[4] - ip[8];
+ d1 = ip[0] - ip[12];
+
+        op[0] = (a1 + b1 + 7)>>4;
+        op[8] = (a1 - b1 + 7)>>4;
+
+        op[4] = ((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16;
+
+ ip++;
+ op++;
+ }
+}
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
+{
+ int i;
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+ short *ip = input;
+ short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ((ip[0] + ip[2])<<2);
+ d1 = ((ip[1] + ip[3])<<2);
+ c1 = ((ip[1] - ip[3])<<2);
+ b1 = ((ip[0] - ip[2])<<2);
+
+ op[0] = a1 + d1 + (a1!=0);
+ op[1] = b1 + c1;
+ op[2] = b1 - c1;
+ op[3] = a1 - d1;
+ ip += pitch / 2;
+ op += 4;
+ }
+
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ d1 = ip[4] + ip[12];
+ c1 = ip[4] - ip[12];
+ b1 = ip[0] - ip[8];
+
+ a2 = a1 + d1;
+ b2 = b1 + c1;
+ c2 = b1 - c1;
+ d2 = a1 - d1;
+
+ a2 += a2<0;
+ b2 += b2<0;
+ c2 += c2<0;
+ d2 += d2<0;
+
+ op[0] = (a2+3) >> 3;
+ op[4] = (b2+3) >> 3;
+ op[8] = (c2+3) >> 3;
+        op[12] = (d2+3) >> 3;
+
+ ip++;
+ op++;
+ }
+}
diff --git a/libvpx/vp8/encoder/dct_value_cost.h b/libvpx/vp8/encoder/dct_value_cost.h
new file mode 100644
index 0000000..e892765
--- /dev/null
+++ b/libvpx/vp8/encoder/dct_value_cost.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Generated file, included by tokenize.c */
+/* Values generated by fill_value_tokens() */
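+/* Indexed by coefficient value plus DCT_MAX_VALUE (2048): entry
+ * [2048 + v] holds the token cost of coefficient v for v in
+ * [-2048, 2047]. tokenize.c reads the table through a pointer offset
+ * by DCT_MAX_VALUE so it can be indexed by the signed value directly.
+ */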
+
+static const short dct_value_cost[2048*2] =
+{
+ 8285, 8277, 8267, 8259, 8253, 8245, 8226, 8218, 8212, 8204, 8194, 8186,
+ 8180, 8172, 8150, 8142, 8136, 8128, 8118, 8110, 8104, 8096, 8077, 8069,
+ 8063, 8055, 8045, 8037, 8031, 8023, 7997, 7989, 7983, 7975, 7965, 7957,
+ 7951, 7943, 7924, 7916, 7910, 7902, 7892, 7884, 7878, 7870, 7848, 7840,
+ 7834, 7826, 7816, 7808, 7802, 7794, 7775, 7767, 7761, 7753, 7743, 7735,
+ 7729, 7721, 7923, 7915, 7909, 7901, 7891, 7883, 7877, 7869, 7850, 7842,
+ 7836, 7828, 7818, 7810, 7804, 7796, 7774, 7766, 7760, 7752, 7742, 7734,
+ 7728, 7720, 7701, 7693, 7687, 7679, 7669, 7661, 7655, 7647, 7621, 7613,
+ 7607, 7599, 7589, 7581, 7575, 7567, 7548, 7540, 7534, 7526, 7516, 7508,
+ 7502, 7494, 7472, 7464, 7458, 7450, 7440, 7432, 7426, 7418, 7399, 7391,
+ 7385, 7377, 7367, 7359, 7353, 7345, 7479, 7471, 7465, 7457, 7447, 7439,
+ 7433, 7425, 7406, 7398, 7392, 7384, 7374, 7366, 7360, 7352, 7330, 7322,
+ 7316, 7308, 7298, 7290, 7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217,
+ 7211, 7203, 7177, 7169, 7163, 7155, 7145, 7137, 7131, 7123, 7104, 7096,
+ 7090, 7082, 7072, 7064, 7058, 7050, 7028, 7020, 7014, 7006, 6996, 6988,
+ 6982, 6974, 6955, 6947, 6941, 6933, 6923, 6915, 6909, 6901, 7632, 7624,
+ 7618, 7610, 7600, 7592, 7586, 7578, 7559, 7551, 7545, 7537, 7527, 7519,
+ 7513, 7505, 7483, 7475, 7469, 7461, 7451, 7443, 7437, 7429, 7410, 7402,
+ 7396, 7388, 7378, 7370, 7364, 7356, 7330, 7322, 7316, 7308, 7298, 7290,
+ 7284, 7276, 7257, 7249, 7243, 7235, 7225, 7217, 7211, 7203, 7181, 7173,
+ 7167, 7159, 7149, 7141, 7135, 7127, 7108, 7100, 7094, 7086, 7076, 7068,
+ 7062, 7054, 7188, 7180, 7174, 7166, 7156, 7148, 7142, 7134, 7115, 7107,
+ 7101, 7093, 7083, 7075, 7069, 7061, 7039, 7031, 7025, 7017, 7007, 6999,
+ 6993, 6985, 6966, 6958, 6952, 6944, 6934, 6926, 6920, 6912, 6886, 6878,
+ 6872, 6864, 6854, 6846, 6840, 6832, 6813, 6805, 6799, 6791, 6781, 6773,
+ 6767, 6759, 6737, 6729, 6723, 6715, 6705, 6697, 6691, 6683, 6664, 6656,
+ 6650, 6642, 6632, 6624, 6618, 6610, 6812, 6804, 6798, 6790, 6780, 6772,
+ 6766, 6758, 6739, 6731, 6725, 6717, 6707, 6699, 6693, 6685, 6663, 6655,
+ 6649, 6641, 6631, 6623, 6617, 6609, 6590, 6582, 6576, 6568, 6558, 6550,
+ 6544, 6536, 6510, 6502, 6496, 6488, 6478, 6470, 6464, 6456, 6437, 6429,
+ 6423, 6415, 6405, 6397, 6391, 6383, 6361, 6353, 6347, 6339, 6329, 6321,
+ 6315, 6307, 6288, 6280, 6274, 6266, 6256, 6248, 6242, 6234, 6368, 6360,
+ 6354, 6346, 6336, 6328, 6322, 6314, 6295, 6287, 6281, 6273, 6263, 6255,
+ 6249, 6241, 6219, 6211, 6205, 6197, 6187, 6179, 6173, 6165, 6146, 6138,
+ 6132, 6124, 6114, 6106, 6100, 6092, 6066, 6058, 6052, 6044, 6034, 6026,
+ 6020, 6012, 5993, 5985, 5979, 5971, 5961, 5953, 5947, 5939, 5917, 5909,
+ 5903, 5895, 5885, 5877, 5871, 5863, 5844, 5836, 5830, 5822, 5812, 5804,
+ 5798, 5790, 6697, 6689, 6683, 6675, 6665, 6657, 6651, 6643, 6624, 6616,
+ 6610, 6602, 6592, 6584, 6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508,
+ 6502, 6494, 6475, 6467, 6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387,
+ 6381, 6373, 6363, 6355, 6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282,
+ 6276, 6268, 6246, 6238, 6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165,
+ 6159, 6151, 6141, 6133, 6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213,
+ 6207, 6199, 6180, 6172, 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096,
+ 6090, 6082, 6072, 6064, 6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991,
+ 5985, 5977, 5951, 5943, 5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870,
+ 5864, 5856, 5846, 5838, 5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762,
+ 5756, 5748, 5729, 5721, 5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869,
+ 5863, 5855, 5845, 5837, 5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764,
+ 5758, 5750, 5728, 5720, 5714, 5706, 5696, 5688, 5682, 5674, 5655, 5647,
+ 5641, 5633, 5623, 5615, 5609, 5601, 5575, 5567, 5561, 5553, 5543, 5535,
+ 5529, 5521, 5502, 5494, 5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418,
+ 5412, 5404, 5394, 5386, 5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313,
+ 5307, 5299, 5433, 5425, 5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352,
+ 5346, 5338, 5328, 5320, 5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244,
+ 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123,
+ 5117, 5109, 5099, 5091, 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018,
+ 5012, 5004, 4982, 4974, 4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901,
+ 4895, 4887, 4877, 4869, 4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546,
+ 5540, 5532, 5513, 5505, 5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429,
+ 5423, 5415, 5405, 5397, 5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324,
+ 5318, 5310, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203,
+ 5197, 5189, 5179, 5171, 5165, 5157, 5135, 5127, 5121, 5113, 5103, 5095,
+ 5089, 5081, 5062, 5054, 5048, 5040, 5030, 5022, 5016, 5008, 5142, 5134,
+ 5128, 5120, 5110, 5102, 5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029,
+ 5023, 5015, 4993, 4985, 4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912,
+ 4906, 4898, 4888, 4880, 4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800,
+ 4794, 4786, 4767, 4759, 4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683,
+ 4677, 4669, 4659, 4651, 4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578,
+ 4572, 4564, 4766, 4758, 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685,
+ 4679, 4671, 4661, 4653, 4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577,
+ 4571, 4563, 4544, 4536, 4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456,
+ 4450, 4442, 4432, 4424, 4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351,
+ 4345, 4337, 4315, 4307, 4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234,
+ 4228, 4220, 4210, 4202, 4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282,
+ 4276, 4268, 4249, 4241, 4235, 4227, 4217, 4209, 4203, 4195, 4173, 4165,
+ 4159, 4151, 4141, 4133, 4127, 4119, 4100, 4092, 4086, 4078, 4068, 4060,
+ 4054, 4046, 4020, 4012, 4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939,
+ 3933, 3925, 3915, 3907, 3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831,
+ 3825, 3817, 3798, 3790, 3784, 3776, 3766, 3758, 3752, 3744, 6697, 6689,
+ 6683, 6675, 6665, 6657, 6651, 6643, 6624, 6616, 6610, 6602, 6592, 6584,
+ 6578, 6570, 6548, 6540, 6534, 6526, 6516, 6508, 6502, 6494, 6475, 6467,
+ 6461, 6453, 6443, 6435, 6429, 6421, 6395, 6387, 6381, 6373, 6363, 6355,
+ 6349, 6341, 6322, 6314, 6308, 6300, 6290, 6282, 6276, 6268, 6246, 6238,
+ 6232, 6224, 6214, 6206, 6200, 6192, 6173, 6165, 6159, 6151, 6141, 6133,
+ 6127, 6119, 6253, 6245, 6239, 6231, 6221, 6213, 6207, 6199, 6180, 6172,
+ 6166, 6158, 6148, 6140, 6134, 6126, 6104, 6096, 6090, 6082, 6072, 6064,
+ 6058, 6050, 6031, 6023, 6017, 6009, 5999, 5991, 5985, 5977, 5951, 5943,
+ 5937, 5929, 5919, 5911, 5905, 5897, 5878, 5870, 5864, 5856, 5846, 5838,
+ 5832, 5824, 5802, 5794, 5788, 5780, 5770, 5762, 5756, 5748, 5729, 5721,
+ 5715, 5707, 5697, 5689, 5683, 5675, 5877, 5869, 5863, 5855, 5845, 5837,
+ 5831, 5823, 5804, 5796, 5790, 5782, 5772, 5764, 5758, 5750, 5728, 5720,
+ 5714, 5706, 5696, 5688, 5682, 5674, 5655, 5647, 5641, 5633, 5623, 5615,
+ 5609, 5601, 5575, 5567, 5561, 5553, 5543, 5535, 5529, 5521, 5502, 5494,
+ 5488, 5480, 5470, 5462, 5456, 5448, 5426, 5418, 5412, 5404, 5394, 5386,
+ 5380, 5372, 5353, 5345, 5339, 5331, 5321, 5313, 5307, 5299, 5433, 5425,
+ 5419, 5411, 5401, 5393, 5387, 5379, 5360, 5352, 5346, 5338, 5328, 5320,
+ 5314, 5306, 5284, 5276, 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203,
+ 5197, 5189, 5179, 5171, 5165, 5157, 5131, 5123, 5117, 5109, 5099, 5091,
+ 5085, 5077, 5058, 5050, 5044, 5036, 5026, 5018, 5012, 5004, 4982, 4974,
+ 4968, 4960, 4950, 4942, 4936, 4928, 4909, 4901, 4895, 4887, 4877, 4869,
+ 4863, 4855, 5586, 5578, 5572, 5564, 5554, 5546, 5540, 5532, 5513, 5505,
+ 5499, 5491, 5481, 5473, 5467, 5459, 5437, 5429, 5423, 5415, 5405, 5397,
+ 5391, 5383, 5364, 5356, 5350, 5342, 5332, 5324, 5318, 5310, 5284, 5276,
+ 5270, 5262, 5252, 5244, 5238, 5230, 5211, 5203, 5197, 5189, 5179, 5171,
+ 5165, 5157, 5135, 5127, 5121, 5113, 5103, 5095, 5089, 5081, 5062, 5054,
+ 5048, 5040, 5030, 5022, 5016, 5008, 5142, 5134, 5128, 5120, 5110, 5102,
+ 5096, 5088, 5069, 5061, 5055, 5047, 5037, 5029, 5023, 5015, 4993, 4985,
+ 4979, 4971, 4961, 4953, 4947, 4939, 4920, 4912, 4906, 4898, 4888, 4880,
+ 4874, 4866, 4840, 4832, 4826, 4818, 4808, 4800, 4794, 4786, 4767, 4759,
+ 4753, 4745, 4735, 4727, 4721, 4713, 4691, 4683, 4677, 4669, 4659, 4651,
+ 4645, 4637, 4618, 4610, 4604, 4596, 4586, 4578, 4572, 4564, 4766, 4758,
+ 4752, 4744, 4734, 4726, 4720, 4712, 4693, 4685, 4679, 4671, 4661, 4653,
+ 4647, 4639, 4617, 4609, 4603, 4595, 4585, 4577, 4571, 4563, 4544, 4536,
+ 4530, 4522, 4512, 4504, 4498, 4490, 4464, 4456, 4450, 4442, 4432, 4424,
+ 4418, 4410, 4391, 4383, 4377, 4369, 4359, 4351, 4345, 4337, 4315, 4307,
+ 4301, 4293, 4283, 4275, 4269, 4261, 4242, 4234, 4228, 4220, 4210, 4202,
+ 4196, 4188, 4322, 4314, 4308, 4300, 4290, 4282, 4276, 4268, 4249, 4241,
+ 4235, 4227, 4217, 4209, 4203, 4195, 4173, 4165, 4159, 4151, 4141, 4133,
+ 4127, 4119, 4100, 4092, 4086, 4078, 4068, 4060, 4054, 4046, 4020, 4012,
+ 4006, 3998, 3988, 3980, 3974, 3966, 3947, 3939, 3933, 3925, 3915, 3907,
+ 3901, 3893, 3871, 3863, 3857, 3849, 3839, 3831, 3825, 3817, 3798, 3790,
+ 3784, 3776, 3766, 3758, 3752, 3744, 4651, 4643, 4637, 4629, 4619, 4611,
+ 4605, 4597, 4578, 4570, 4564, 4556, 4546, 4538, 4532, 4524, 4502, 4494,
+ 4488, 4480, 4470, 4462, 4456, 4448, 4429, 4421, 4415, 4407, 4397, 4389,
+ 4383, 4375, 4349, 4341, 4335, 4327, 4317, 4309, 4303, 4295, 4276, 4268,
+ 4262, 4254, 4244, 4236, 4230, 4222, 4200, 4192, 4186, 4178, 4168, 4160,
+ 4154, 4146, 4127, 4119, 4113, 4105, 4095, 4087, 4081, 4073, 4207, 4199,
+ 4193, 4185, 4175, 4167, 4161, 4153, 4134, 4126, 4120, 4112, 4102, 4094,
+ 4088, 4080, 4058, 4050, 4044, 4036, 4026, 4018, 4012, 4004, 3985, 3977,
+ 3971, 3963, 3953, 3945, 3939, 3931, 3905, 3897, 3891, 3883, 3873, 3865,
+ 3859, 3851, 3832, 3824, 3818, 3810, 3800, 3792, 3786, 3778, 3756, 3748,
+ 3742, 3734, 3724, 3716, 3710, 3702, 3683, 3675, 3669, 3661, 3651, 3643,
+ 3637, 3629, 3831, 3823, 3817, 3809, 3799, 3791, 3785, 3777, 3758, 3750,
+ 3744, 3736, 3726, 3718, 3712, 3704, 3682, 3674, 3668, 3660, 3650, 3642,
+ 3636, 3628, 3609, 3601, 3595, 3587, 3577, 3569, 3563, 3555, 3529, 3521,
+ 3515, 3507, 3497, 3489, 3483, 3475, 3456, 3448, 3442, 3434, 3424, 3416,
+ 3410, 3402, 3380, 3372, 3366, 3358, 3348, 3340, 3334, 3326, 3307, 3299,
+ 3293, 3285, 3275, 3267, 3261, 3253, 3387, 3379, 3373, 3365, 3355, 3347,
+ 3341, 3333, 3314, 3306, 3300, 3292, 3282, 3274, 3268, 3260, 3238, 3230,
+ 3224, 3216, 3206, 3198, 3192, 3184, 3165, 3157, 3151, 3143, 3133, 3125,
+ 3119, 3111, 3085, 3077, 3071, 3063, 3053, 3045, 3039, 3031, 3012, 3004,
+ 2998, 2990, 2980, 2972, 2966, 2958, 2936, 2928, 2922, 2914, 2904, 2896,
+ 2890, 2882, 2863, 2855, 2849, 2841, 2831, 2823, 2817, 2809, 3540, 3532,
+ 3526, 3518, 3508, 3500, 3494, 3486, 3467, 3459, 3453, 3445, 3435, 3427,
+ 3421, 3413, 3391, 3383, 3377, 3369, 3359, 3351, 3345, 3337, 3318, 3310,
+ 3304, 3296, 3286, 3278, 3272, 3264, 3238, 3230, 3224, 3216, 3206, 3198,
+ 3192, 3184, 3165, 3157, 3151, 3143, 3133, 3125, 3119, 3111, 3089, 3081,
+ 3075, 3067, 3057, 3049, 3043, 3035, 3016, 3008, 3002, 2994, 2984, 2976,
+ 2970, 2962, 3096, 3088, 3082, 3074, 3064, 3056, 3050, 3042, 3023, 3015,
+ 3009, 3001, 2991, 2983, 2977, 2969, 2947, 2939, 2933, 2925, 2915, 2907,
+ 2901, 2893, 2874, 2866, 2860, 2852, 2842, 2834, 2828, 2820, 2794, 2786,
+ 2780, 2772, 2762, 2754, 2748, 2740, 2721, 2713, 2707, 2699, 2689, 2681,
+ 2675, 2667, 2645, 2637, 2631, 2623, 2613, 2605, 2599, 2591, 2572, 2564,
+ 2558, 2550, 2540, 2532, 2526, 2518, 2720, 2712, 2706, 2698, 2688, 2680,
+ 2674, 2666, 2647, 2639, 2633, 2625, 2615, 2607, 2601, 2593, 2571, 2563,
+ 2557, 2549, 2539, 2531, 2525, 2517, 2498, 2490, 2484, 2476, 2466, 2458,
+ 2452, 2444, 2418, 2410, 2404, 2396, 2386, 2378, 2372, 2364, 2345, 2337,
+ 2331, 2323, 2313, 2305, 2299, 2291, 2269, 2261, 2255, 2247, 2237, 2229,
+ 2223, 2215, 2196, 2188, 2182, 2174, 2164, 2156, 2150, 2142, 2276, 2268,
+ 2262, 2254, 2244, 2236, 2230, 2222, 2203, 2195, 2189, 2181, 2171, 2163,
+ 2157, 2149, 2127, 2119, 2113, 2105, 2095, 2087, 2081, 2073, 2054, 2046,
+ 2040, 2032, 2022, 2014, 2008, 2000, 1974, 1966, 1960, 1952, 1942, 1934,
+ 1928, 1920, 1901, 1893, 1887, 1879, 1869, 1861, 1855, 1847, 1825, 1817,
+ 1811, 1803, 1793, 1785, 1779, 1771, 1752, 1744, 1738, 1730, 1720, 1712,
+ 1706, 1698, 1897, 1883, 1860, 1846, 1819, 1805, 1782, 1768, 1723, 1709,
+ 1686, 1672, 1645, 1631, 1608, 1594, 1574, 1560, 1537, 1523, 1496, 1482,
+ 1459, 1445, 1400, 1386, 1363, 1349, 1322, 1308, 1285, 1271, 1608, 1565,
+ 1535, 1492, 1446, 1403, 1373, 1330, 1312, 1269, 1239, 1196, 1150, 1107,
+ 1077, 1034, 1291, 1218, 1171, 1098, 1015, 942, 895, 822, 953, 850,
+ 729, 626, 618, 431, 257, 257, 257, 257, 0, 255, 255, 255,
+ 255, 429, 616, 624, 727, 848, 951, 820, 893, 940, 1013, 1096,
+ 1169, 1216, 1289, 1032, 1075, 1105, 1148, 1194, 1237, 1267, 1310, 1328,
+ 1371, 1401, 1444, 1490, 1533, 1563, 1606, 1269, 1283, 1306, 1320, 1347,
+ 1361, 1384, 1398, 1443, 1457, 1480, 1494, 1521, 1535, 1558, 1572, 1592,
+ 1606, 1629, 1643, 1670, 1684, 1707, 1721, 1766, 1780, 1803, 1817, 1844,
+ 1858, 1881, 1895, 1696, 1704, 1710, 1718, 1728, 1736, 1742, 1750, 1769,
+ 1777, 1783, 1791, 1801, 1809, 1815, 1823, 1845, 1853, 1859, 1867, 1877,
+ 1885, 1891, 1899, 1918, 1926, 1932, 1940, 1950, 1958, 1964, 1972, 1998,
+ 2006, 2012, 2020, 2030, 2038, 2044, 2052, 2071, 2079, 2085, 2093, 2103,
+ 2111, 2117, 2125, 2147, 2155, 2161, 2169, 2179, 2187, 2193, 2201, 2220,
+ 2228, 2234, 2242, 2252, 2260, 2266, 2274, 2140, 2148, 2154, 2162, 2172,
+ 2180, 2186, 2194, 2213, 2221, 2227, 2235, 2245, 2253, 2259, 2267, 2289,
+ 2297, 2303, 2311, 2321, 2329, 2335, 2343, 2362, 2370, 2376, 2384, 2394,
+ 2402, 2408, 2416, 2442, 2450, 2456, 2464, 2474, 2482, 2488, 2496, 2515,
+ 2523, 2529, 2537, 2547, 2555, 2561, 2569, 2591, 2599, 2605, 2613, 2623,
+ 2631, 2637, 2645, 2664, 2672, 2678, 2686, 2696, 2704, 2710, 2718, 2516,
+ 2524, 2530, 2538, 2548, 2556, 2562, 2570, 2589, 2597, 2603, 2611, 2621,
+ 2629, 2635, 2643, 2665, 2673, 2679, 2687, 2697, 2705, 2711, 2719, 2738,
+ 2746, 2752, 2760, 2770, 2778, 2784, 2792, 2818, 2826, 2832, 2840, 2850,
+ 2858, 2864, 2872, 2891, 2899, 2905, 2913, 2923, 2931, 2937, 2945, 2967,
+ 2975, 2981, 2989, 2999, 3007, 3013, 3021, 3040, 3048, 3054, 3062, 3072,
+ 3080, 3086, 3094, 2960, 2968, 2974, 2982, 2992, 3000, 3006, 3014, 3033,
+ 3041, 3047, 3055, 3065, 3073, 3079, 3087, 3109, 3117, 3123, 3131, 3141,
+ 3149, 3155, 3163, 3182, 3190, 3196, 3204, 3214, 3222, 3228, 3236, 3262,
+ 3270, 3276, 3284, 3294, 3302, 3308, 3316, 3335, 3343, 3349, 3357, 3367,
+ 3375, 3381, 3389, 3411, 3419, 3425, 3433, 3443, 3451, 3457, 3465, 3484,
+ 3492, 3498, 3506, 3516, 3524, 3530, 3538, 2807, 2815, 2821, 2829, 2839,
+ 2847, 2853, 2861, 2880, 2888, 2894, 2902, 2912, 2920, 2926, 2934, 2956,
+ 2964, 2970, 2978, 2988, 2996, 3002, 3010, 3029, 3037, 3043, 3051, 3061,
+ 3069, 3075, 3083, 3109, 3117, 3123, 3131, 3141, 3149, 3155, 3163, 3182,
+ 3190, 3196, 3204, 3214, 3222, 3228, 3236, 3258, 3266, 3272, 3280, 3290,
+ 3298, 3304, 3312, 3331, 3339, 3345, 3353, 3363, 3371, 3377, 3385, 3251,
+ 3259, 3265, 3273, 3283, 3291, 3297, 3305, 3324, 3332, 3338, 3346, 3356,
+ 3364, 3370, 3378, 3400, 3408, 3414, 3422, 3432, 3440, 3446, 3454, 3473,
+ 3481, 3487, 3495, 3505, 3513, 3519, 3527, 3553, 3561, 3567, 3575, 3585,
+ 3593, 3599, 3607, 3626, 3634, 3640, 3648, 3658, 3666, 3672, 3680, 3702,
+ 3710, 3716, 3724, 3734, 3742, 3748, 3756, 3775, 3783, 3789, 3797, 3807,
+ 3815, 3821, 3829, 3627, 3635, 3641, 3649, 3659, 3667, 3673, 3681, 3700,
+ 3708, 3714, 3722, 3732, 3740, 3746, 3754, 3776, 3784, 3790, 3798, 3808,
+ 3816, 3822, 3830, 3849, 3857, 3863, 3871, 3881, 3889, 3895, 3903, 3929,
+ 3937, 3943, 3951, 3961, 3969, 3975, 3983, 4002, 4010, 4016, 4024, 4034,
+ 4042, 4048, 4056, 4078, 4086, 4092, 4100, 4110, 4118, 4124, 4132, 4151,
+ 4159, 4165, 4173, 4183, 4191, 4197, 4205, 4071, 4079, 4085, 4093, 4103,
+ 4111, 4117, 4125, 4144, 4152, 4158, 4166, 4176, 4184, 4190, 4198, 4220,
+ 4228, 4234, 4242, 4252, 4260, 4266, 4274, 4293, 4301, 4307, 4315, 4325,
+ 4333, 4339, 4347, 4373, 4381, 4387, 4395, 4405, 4413, 4419, 4427, 4446,
+ 4454, 4460, 4468, 4478, 4486, 4492, 4500, 4522, 4530, 4536, 4544, 4554,
+ 4562, 4568, 4576, 4595, 4603, 4609, 4617, 4627, 4635, 4641, 4649, 3742,
+ 3750, 3756, 3764, 3774, 3782, 3788, 3796, 3815, 3823, 3829, 3837, 3847,
+ 3855, 3861, 3869, 3891, 3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964,
+ 3972, 3978, 3986, 3996, 4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076,
+ 4084, 4090, 4098, 4117, 4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193,
+ 4201, 4207, 4215, 4225, 4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298,
+ 4306, 4312, 4320, 4186, 4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259,
+ 4267, 4273, 4281, 4291, 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367,
+ 4375, 4381, 4389, 4408, 4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488,
+ 4496, 4502, 4510, 4520, 4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593,
+ 4601, 4607, 4615, 4637, 4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710,
+ 4718, 4724, 4732, 4742, 4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594,
+ 4602, 4608, 4616, 4635, 4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711,
+ 4719, 4725, 4733, 4743, 4751, 4757, 4765, 4784, 4792, 4798, 4806, 4816,
+ 4824, 4830, 4838, 4864, 4872, 4878, 4886, 4896, 4904, 4910, 4918, 4937,
+ 4945, 4951, 4959, 4969, 4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045,
+ 5053, 5059, 5067, 5086, 5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006,
+ 5014, 5020, 5028, 5038, 5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111,
+ 5119, 5125, 5133, 5155, 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228,
+ 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340,
+ 5348, 5354, 5362, 5381, 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457,
+ 5465, 5471, 5479, 5489, 5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562,
+ 5570, 5576, 5584, 4853, 4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926,
+ 4934, 4940, 4948, 4958, 4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034,
+ 5042, 5048, 5056, 5075, 5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155,
+ 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260,
+ 5268, 5274, 5282, 5304, 5312, 5318, 5326, 5336, 5344, 5350, 5358, 5377,
+ 5385, 5391, 5399, 5409, 5417, 5423, 5431, 5297, 5305, 5311, 5319, 5329,
+ 5337, 5343, 5351, 5370, 5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446,
+ 5454, 5460, 5468, 5478, 5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551,
+ 5559, 5565, 5573, 5599, 5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672,
+ 5680, 5686, 5694, 5704, 5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780,
+ 5788, 5794, 5802, 5821, 5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673,
+ 5681, 5687, 5695, 5705, 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778,
+ 5786, 5792, 5800, 5822, 5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895,
+ 5903, 5909, 5917, 5927, 5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007,
+ 6015, 6021, 6029, 6048, 6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124,
+ 6132, 6138, 6146, 6156, 6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229,
+ 6237, 6243, 6251, 6117, 6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190,
+ 6198, 6204, 6212, 6222, 6230, 6236, 6244, 6266, 6274, 6280, 6288, 6298,
+ 6306, 6312, 6320, 6339, 6347, 6353, 6361, 6371, 6379, 6385, 6393, 6419,
+ 6427, 6433, 6441, 6451, 6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524,
+ 6532, 6538, 6546, 6568, 6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641,
+ 6649, 6655, 6663, 6673, 6681, 6687, 6695, 3742, 3750, 3756, 3764, 3774,
+ 3782, 3788, 3796, 3815, 3823, 3829, 3837, 3847, 3855, 3861, 3869, 3891,
+ 3899, 3905, 3913, 3923, 3931, 3937, 3945, 3964, 3972, 3978, 3986, 3996,
+ 4004, 4010, 4018, 4044, 4052, 4058, 4066, 4076, 4084, 4090, 4098, 4117,
+ 4125, 4131, 4139, 4149, 4157, 4163, 4171, 4193, 4201, 4207, 4215, 4225,
+ 4233, 4239, 4247, 4266, 4274, 4280, 4288, 4298, 4306, 4312, 4320, 4186,
+ 4194, 4200, 4208, 4218, 4226, 4232, 4240, 4259, 4267, 4273, 4281, 4291,
+ 4299, 4305, 4313, 4335, 4343, 4349, 4357, 4367, 4375, 4381, 4389, 4408,
+ 4416, 4422, 4430, 4440, 4448, 4454, 4462, 4488, 4496, 4502, 4510, 4520,
+ 4528, 4534, 4542, 4561, 4569, 4575, 4583, 4593, 4601, 4607, 4615, 4637,
+ 4645, 4651, 4659, 4669, 4677, 4683, 4691, 4710, 4718, 4724, 4732, 4742,
+ 4750, 4756, 4764, 4562, 4570, 4576, 4584, 4594, 4602, 4608, 4616, 4635,
+ 4643, 4649, 4657, 4667, 4675, 4681, 4689, 4711, 4719, 4725, 4733, 4743,
+ 4751, 4757, 4765, 4784, 4792, 4798, 4806, 4816, 4824, 4830, 4838, 4864,
+ 4872, 4878, 4886, 4896, 4904, 4910, 4918, 4937, 4945, 4951, 4959, 4969,
+ 4977, 4983, 4991, 5013, 5021, 5027, 5035, 5045, 5053, 5059, 5067, 5086,
+ 5094, 5100, 5108, 5118, 5126, 5132, 5140, 5006, 5014, 5020, 5028, 5038,
+ 5046, 5052, 5060, 5079, 5087, 5093, 5101, 5111, 5119, 5125, 5133, 5155,
+ 5163, 5169, 5177, 5187, 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260,
+ 5268, 5274, 5282, 5308, 5316, 5322, 5330, 5340, 5348, 5354, 5362, 5381,
+ 5389, 5395, 5403, 5413, 5421, 5427, 5435, 5457, 5465, 5471, 5479, 5489,
+ 5497, 5503, 5511, 5530, 5538, 5544, 5552, 5562, 5570, 5576, 5584, 4853,
+ 4861, 4867, 4875, 4885, 4893, 4899, 4907, 4926, 4934, 4940, 4948, 4958,
+ 4966, 4972, 4980, 5002, 5010, 5016, 5024, 5034, 5042, 5048, 5056, 5075,
+ 5083, 5089, 5097, 5107, 5115, 5121, 5129, 5155, 5163, 5169, 5177, 5187,
+ 5195, 5201, 5209, 5228, 5236, 5242, 5250, 5260, 5268, 5274, 5282, 5304,
+ 5312, 5318, 5326, 5336, 5344, 5350, 5358, 5377, 5385, 5391, 5399, 5409,
+ 5417, 5423, 5431, 5297, 5305, 5311, 5319, 5329, 5337, 5343, 5351, 5370,
+ 5378, 5384, 5392, 5402, 5410, 5416, 5424, 5446, 5454, 5460, 5468, 5478,
+ 5486, 5492, 5500, 5519, 5527, 5533, 5541, 5551, 5559, 5565, 5573, 5599,
+ 5607, 5613, 5621, 5631, 5639, 5645, 5653, 5672, 5680, 5686, 5694, 5704,
+ 5712, 5718, 5726, 5748, 5756, 5762, 5770, 5780, 5788, 5794, 5802, 5821,
+ 5829, 5835, 5843, 5853, 5861, 5867, 5875, 5673, 5681, 5687, 5695, 5705,
+ 5713, 5719, 5727, 5746, 5754, 5760, 5768, 5778, 5786, 5792, 5800, 5822,
+ 5830, 5836, 5844, 5854, 5862, 5868, 5876, 5895, 5903, 5909, 5917, 5927,
+ 5935, 5941, 5949, 5975, 5983, 5989, 5997, 6007, 6015, 6021, 6029, 6048,
+ 6056, 6062, 6070, 6080, 6088, 6094, 6102, 6124, 6132, 6138, 6146, 6156,
+ 6164, 6170, 6178, 6197, 6205, 6211, 6219, 6229, 6237, 6243, 6251, 6117,
+ 6125, 6131, 6139, 6149, 6157, 6163, 6171, 6190, 6198, 6204, 6212, 6222,
+ 6230, 6236, 6244, 6266, 6274, 6280, 6288, 6298, 6306, 6312, 6320, 6339,
+ 6347, 6353, 6361, 6371, 6379, 6385, 6393, 6419, 6427, 6433, 6441, 6451,
+ 6459, 6465, 6473, 6492, 6500, 6506, 6514, 6524, 6532, 6538, 6546, 6568,
+ 6576, 6582, 6590, 6600, 6608, 6614, 6622, 6641, 6649, 6655, 6663, 6673,
+ 6681, 6687, 6695, 5788, 5796, 5802, 5810, 5820, 5828, 5834, 5842, 5861,
+ 5869, 5875, 5883, 5893, 5901, 5907, 5915, 5937, 5945, 5951, 5959, 5969,
+ 5977, 5983, 5991, 6010, 6018, 6024, 6032, 6042, 6050, 6056, 6064, 6090,
+ 6098, 6104, 6112, 6122, 6130, 6136, 6144, 6163, 6171, 6177, 6185, 6195,
+ 6203, 6209, 6217, 6239, 6247, 6253, 6261, 6271, 6279, 6285, 6293, 6312,
+ 6320, 6326, 6334, 6344, 6352, 6358, 6366, 6232, 6240, 6246, 6254, 6264,
+ 6272, 6278, 6286, 6305, 6313, 6319, 6327, 6337, 6345, 6351, 6359, 6381,
+ 6389, 6395, 6403, 6413, 6421, 6427, 6435, 6454, 6462, 6468, 6476, 6486,
+ 6494, 6500, 6508, 6534, 6542, 6548, 6556, 6566, 6574, 6580, 6588, 6607,
+ 6615, 6621, 6629, 6639, 6647, 6653, 6661, 6683, 6691, 6697, 6705, 6715,
+ 6723, 6729, 6737, 6756, 6764, 6770, 6778, 6788, 6796, 6802, 6810, 6608,
+ 6616, 6622, 6630, 6640, 6648, 6654, 6662, 6681, 6689, 6695, 6703, 6713,
+ 6721, 6727, 6735, 6757, 6765, 6771, 6779, 6789, 6797, 6803, 6811, 6830,
+ 6838, 6844, 6852, 6862, 6870, 6876, 6884, 6910, 6918, 6924, 6932, 6942,
+ 6950, 6956, 6964, 6983, 6991, 6997, 7005, 7015, 7023, 7029, 7037, 7059,
+ 7067, 7073, 7081, 7091, 7099, 7105, 7113, 7132, 7140, 7146, 7154, 7164,
+ 7172, 7178, 7186, 7052, 7060, 7066, 7074, 7084, 7092, 7098, 7106, 7125,
+ 7133, 7139, 7147, 7157, 7165, 7171, 7179, 7201, 7209, 7215, 7223, 7233,
+ 7241, 7247, 7255, 7274, 7282, 7288, 7296, 7306, 7314, 7320, 7328, 7354,
+ 7362, 7368, 7376, 7386, 7394, 7400, 7408, 7427, 7435, 7441, 7449, 7459,
+ 7467, 7473, 7481, 7503, 7511, 7517, 7525, 7535, 7543, 7549, 7557, 7576,
+ 7584, 7590, 7598, 7608, 7616, 7622, 7630, 6899, 6907, 6913, 6921, 6931,
+ 6939, 6945, 6953, 6972, 6980, 6986, 6994, 7004, 7012, 7018, 7026, 7048,
+ 7056, 7062, 7070, 7080, 7088, 7094, 7102, 7121, 7129, 7135, 7143, 7153,
+ 7161, 7167, 7175, 7201, 7209, 7215, 7223, 7233, 7241, 7247, 7255, 7274,
+ 7282, 7288, 7296, 7306, 7314, 7320, 7328, 7350, 7358, 7364, 7372, 7382,
+ 7390, 7396, 7404, 7423, 7431, 7437, 7445, 7455, 7463, 7469, 7477, 7343,
+ 7351, 7357, 7365, 7375, 7383, 7389, 7397, 7416, 7424, 7430, 7438, 7448,
+ 7456, 7462, 7470, 7492, 7500, 7506, 7514, 7524, 7532, 7538, 7546, 7565,
+ 7573, 7579, 7587, 7597, 7605, 7611, 7619, 7645, 7653, 7659, 7667, 7677,
+ 7685, 7691, 7699, 7718, 7726, 7732, 7740, 7750, 7758, 7764, 7772, 7794,
+ 7802, 7808, 7816, 7826, 7834, 7840, 7848, 7867, 7875, 7881, 7889, 7899,
+ 7907, 7913, 7921, 7719, 7727, 7733, 7741, 7751, 7759, 7765, 7773, 7792,
+ 7800, 7806, 7814, 7824, 7832, 7838, 7846, 7868, 7876, 7882, 7890, 7900,
+ 7908, 7914, 7922, 7941, 7949, 7955, 7963, 7973, 7981, 7987, 7995, 8021,
+ 8029, 8035, 8043, 8053, 8061, 8067, 8075, 8094, 8102, 8108, 8116, 8126,
+ 8134, 8140, 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243,
+ 8251, 8257, 8265, 8275
+};
diff --git a/libvpx/vp8/encoder/dct_value_tokens.h b/libvpx/vp8/encoder/dct_value_tokens.h
new file mode 100644
index 0000000..ef08eed
--- /dev/null
+++ b/libvpx/vp8/encoder/dct_value_tokens.h
@@ -0,0 +1,699 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Generated file, included by tokenize.c */
+/* Values generated by fill_value_tokens() */
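+/* Same indexing as dct_value_cost: entry [2048 + v] holds the (token,
+ * extra bits) pair for coefficient v. E.g. entry 0 (v = -2048) is
+ * {10, 3963}: token DCT_VAL_CATEGORY6 with extra =
+ * ((2048 - 67) << 1) | 1, where 67 is the category-6 base value and
+ * the low bit carries the sign.
+ */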
+
+static const TOKENVALUE dct_value_tokens[2048*2] =
+{
+ {10, 3963}, {10, 3961}, {10, 3959}, {10, 3957}, {10, 3955}, {10, 3953},
+ {10, 3951}, {10, 3949}, {10, 3947}, {10, 3945}, {10, 3943}, {10, 3941},
+ {10, 3939}, {10, 3937}, {10, 3935}, {10, 3933}, {10, 3931}, {10, 3929},
+ {10, 3927}, {10, 3925}, {10, 3923}, {10, 3921}, {10, 3919}, {10, 3917},
+ {10, 3915}, {10, 3913}, {10, 3911}, {10, 3909}, {10, 3907}, {10, 3905},
+ {10, 3903}, {10, 3901}, {10, 3899}, {10, 3897}, {10, 3895}, {10, 3893},
+ {10, 3891}, {10, 3889}, {10, 3887}, {10, 3885}, {10, 3883}, {10, 3881},
+ {10, 3879}, {10, 3877}, {10, 3875}, {10, 3873}, {10, 3871}, {10, 3869},
+ {10, 3867}, {10, 3865}, {10, 3863}, {10, 3861}, {10, 3859}, {10, 3857},
+ {10, 3855}, {10, 3853}, {10, 3851}, {10, 3849}, {10, 3847}, {10, 3845},
+ {10, 3843}, {10, 3841}, {10, 3839}, {10, 3837}, {10, 3835}, {10, 3833},
+ {10, 3831}, {10, 3829}, {10, 3827}, {10, 3825}, {10, 3823}, {10, 3821},
+ {10, 3819}, {10, 3817}, {10, 3815}, {10, 3813}, {10, 3811}, {10, 3809},
+ {10, 3807}, {10, 3805}, {10, 3803}, {10, 3801}, {10, 3799}, {10, 3797},
+ {10, 3795}, {10, 3793}, {10, 3791}, {10, 3789}, {10, 3787}, {10, 3785},
+ {10, 3783}, {10, 3781}, {10, 3779}, {10, 3777}, {10, 3775}, {10, 3773},
+ {10, 3771}, {10, 3769}, {10, 3767}, {10, 3765}, {10, 3763}, {10, 3761},
+ {10, 3759}, {10, 3757}, {10, 3755}, {10, 3753}, {10, 3751}, {10, 3749},
+ {10, 3747}, {10, 3745}, {10, 3743}, {10, 3741}, {10, 3739}, {10, 3737},
+ {10, 3735}, {10, 3733}, {10, 3731}, {10, 3729}, {10, 3727}, {10, 3725},
+ {10, 3723}, {10, 3721}, {10, 3719}, {10, 3717}, {10, 3715}, {10, 3713},
+ {10, 3711}, {10, 3709}, {10, 3707}, {10, 3705}, {10, 3703}, {10, 3701},
+ {10, 3699}, {10, 3697}, {10, 3695}, {10, 3693}, {10, 3691}, {10, 3689},
+ {10, 3687}, {10, 3685}, {10, 3683}, {10, 3681}, {10, 3679}, {10, 3677},
+ {10, 3675}, {10, 3673}, {10, 3671}, {10, 3669}, {10, 3667}, {10, 3665},
+ {10, 3663}, {10, 3661}, {10, 3659}, {10, 3657}, {10, 3655}, {10, 3653},
+ {10, 3651}, {10, 3649}, {10, 3647}, {10, 3645}, {10, 3643}, {10, 3641},
+ {10, 3639}, {10, 3637}, {10, 3635}, {10, 3633}, {10, 3631}, {10, 3629},
+ {10, 3627}, {10, 3625}, {10, 3623}, {10, 3621}, {10, 3619}, {10, 3617},
+ {10, 3615}, {10, 3613}, {10, 3611}, {10, 3609}, {10, 3607}, {10, 3605},
+ {10, 3603}, {10, 3601}, {10, 3599}, {10, 3597}, {10, 3595}, {10, 3593},
+ {10, 3591}, {10, 3589}, {10, 3587}, {10, 3585}, {10, 3583}, {10, 3581},
+ {10, 3579}, {10, 3577}, {10, 3575}, {10, 3573}, {10, 3571}, {10, 3569},
+ {10, 3567}, {10, 3565}, {10, 3563}, {10, 3561}, {10, 3559}, {10, 3557},
+ {10, 3555}, {10, 3553}, {10, 3551}, {10, 3549}, {10, 3547}, {10, 3545},
+ {10, 3543}, {10, 3541}, {10, 3539}, {10, 3537}, {10, 3535}, {10, 3533},
+ {10, 3531}, {10, 3529}, {10, 3527}, {10, 3525}, {10, 3523}, {10, 3521},
+ {10, 3519}, {10, 3517}, {10, 3515}, {10, 3513}, {10, 3511}, {10, 3509},
+ {10, 3507}, {10, 3505}, {10, 3503}, {10, 3501}, {10, 3499}, {10, 3497},
+ {10, 3495}, {10, 3493}, {10, 3491}, {10, 3489}, {10, 3487}, {10, 3485},
+ {10, 3483}, {10, 3481}, {10, 3479}, {10, 3477}, {10, 3475}, {10, 3473},
+ {10, 3471}, {10, 3469}, {10, 3467}, {10, 3465}, {10, 3463}, {10, 3461},
+ {10, 3459}, {10, 3457}, {10, 3455}, {10, 3453}, {10, 3451}, {10, 3449},
+ {10, 3447}, {10, 3445}, {10, 3443}, {10, 3441}, {10, 3439}, {10, 3437},
+ {10, 3435}, {10, 3433}, {10, 3431}, {10, 3429}, {10, 3427}, {10, 3425},
+ {10, 3423}, {10, 3421}, {10, 3419}, {10, 3417}, {10, 3415}, {10, 3413},
+ {10, 3411}, {10, 3409}, {10, 3407}, {10, 3405}, {10, 3403}, {10, 3401},
+ {10, 3399}, {10, 3397}, {10, 3395}, {10, 3393}, {10, 3391}, {10, 3389},
+ {10, 3387}, {10, 3385}, {10, 3383}, {10, 3381}, {10, 3379}, {10, 3377},
+ {10, 3375}, {10, 3373}, {10, 3371}, {10, 3369}, {10, 3367}, {10, 3365},
+ {10, 3363}, {10, 3361}, {10, 3359}, {10, 3357}, {10, 3355}, {10, 3353},
+ {10, 3351}, {10, 3349}, {10, 3347}, {10, 3345}, {10, 3343}, {10, 3341},
+ {10, 3339}, {10, 3337}, {10, 3335}, {10, 3333}, {10, 3331}, {10, 3329},
+ {10, 3327}, {10, 3325}, {10, 3323}, {10, 3321}, {10, 3319}, {10, 3317},
+ {10, 3315}, {10, 3313}, {10, 3311}, {10, 3309}, {10, 3307}, {10, 3305},
+ {10, 3303}, {10, 3301}, {10, 3299}, {10, 3297}, {10, 3295}, {10, 3293},
+ {10, 3291}, {10, 3289}, {10, 3287}, {10, 3285}, {10, 3283}, {10, 3281},
+ {10, 3279}, {10, 3277}, {10, 3275}, {10, 3273}, {10, 3271}, {10, 3269},
+ {10, 3267}, {10, 3265}, {10, 3263}, {10, 3261}, {10, 3259}, {10, 3257},
+ {10, 3255}, {10, 3253}, {10, 3251}, {10, 3249}, {10, 3247}, {10, 3245},
+ {10, 3243}, {10, 3241}, {10, 3239}, {10, 3237}, {10, 3235}, {10, 3233},
+ {10, 3231}, {10, 3229}, {10, 3227}, {10, 3225}, {10, 3223}, {10, 3221},
+ {10, 3219}, {10, 3217}, {10, 3215}, {10, 3213}, {10, 3211}, {10, 3209},
+ {10, 3207}, {10, 3205}, {10, 3203}, {10, 3201}, {10, 3199}, {10, 3197},
+ {10, 3195}, {10, 3193}, {10, 3191}, {10, 3189}, {10, 3187}, {10, 3185},
+ {10, 3183}, {10, 3181}, {10, 3179}, {10, 3177}, {10, 3175}, {10, 3173},
+ {10, 3171}, {10, 3169}, {10, 3167}, {10, 3165}, {10, 3163}, {10, 3161},
+ {10, 3159}, {10, 3157}, {10, 3155}, {10, 3153}, {10, 3151}, {10, 3149},
+ {10, 3147}, {10, 3145}, {10, 3143}, {10, 3141}, {10, 3139}, {10, 3137},
+ {10, 3135}, {10, 3133}, {10, 3131}, {10, 3129}, {10, 3127}, {10, 3125},
+ {10, 3123}, {10, 3121}, {10, 3119}, {10, 3117}, {10, 3115}, {10, 3113},
+ {10, 3111}, {10, 3109}, {10, 3107}, {10, 3105}, {10, 3103}, {10, 3101},
+ {10, 3099}, {10, 3097}, {10, 3095}, {10, 3093}, {10, 3091}, {10, 3089},
+ {10, 3087}, {10, 3085}, {10, 3083}, {10, 3081}, {10, 3079}, {10, 3077},
+ {10, 3075}, {10, 3073}, {10, 3071}, {10, 3069}, {10, 3067}, {10, 3065},
+ {10, 3063}, {10, 3061}, {10, 3059}, {10, 3057}, {10, 3055}, {10, 3053},
+ {10, 3051}, {10, 3049}, {10, 3047}, {10, 3045}, {10, 3043}, {10, 3041},
+ {10, 3039}, {10, 3037}, {10, 3035}, {10, 3033}, {10, 3031}, {10, 3029},
+ {10, 3027}, {10, 3025}, {10, 3023}, {10, 3021}, {10, 3019}, {10, 3017},
+ {10, 3015}, {10, 3013}, {10, 3011}, {10, 3009}, {10, 3007}, {10, 3005},
+ {10, 3003}, {10, 3001}, {10, 2999}, {10, 2997}, {10, 2995}, {10, 2993},
+ {10, 2991}, {10, 2989}, {10, 2987}, {10, 2985}, {10, 2983}, {10, 2981},
+ {10, 2979}, {10, 2977}, {10, 2975}, {10, 2973}, {10, 2971}, {10, 2969},
+ {10, 2967}, {10, 2965}, {10, 2963}, {10, 2961}, {10, 2959}, {10, 2957},
+ {10, 2955}, {10, 2953}, {10, 2951}, {10, 2949}, {10, 2947}, {10, 2945},
+ {10, 2943}, {10, 2941}, {10, 2939}, {10, 2937}, {10, 2935}, {10, 2933},
+ {10, 2931}, {10, 2929}, {10, 2927}, {10, 2925}, {10, 2923}, {10, 2921},
+ {10, 2919}, {10, 2917}, {10, 2915}, {10, 2913}, {10, 2911}, {10, 2909},
+ {10, 2907}, {10, 2905}, {10, 2903}, {10, 2901}, {10, 2899}, {10, 2897},
+ {10, 2895}, {10, 2893}, {10, 2891}, {10, 2889}, {10, 2887}, {10, 2885},
+ {10, 2883}, {10, 2881}, {10, 2879}, {10, 2877}, {10, 2875}, {10, 2873},
+ {10, 2871}, {10, 2869}, {10, 2867}, {10, 2865}, {10, 2863}, {10, 2861},
+ {10, 2859}, {10, 2857}, {10, 2855}, {10, 2853}, {10, 2851}, {10, 2849},
+ {10, 2847}, {10, 2845}, {10, 2843}, {10, 2841}, {10, 2839}, {10, 2837},
+ {10, 2835}, {10, 2833}, {10, 2831}, {10, 2829}, {10, 2827}, {10, 2825},
+ {10, 2823}, {10, 2821}, {10, 2819}, {10, 2817}, {10, 2815}, {10, 2813},
+ {10, 2811}, {10, 2809}, {10, 2807}, {10, 2805}, {10, 2803}, {10, 2801},
+ {10, 2799}, {10, 2797}, {10, 2795}, {10, 2793}, {10, 2791}, {10, 2789},
+ {10, 2787}, {10, 2785}, {10, 2783}, {10, 2781}, {10, 2779}, {10, 2777},
+ {10, 2775}, {10, 2773}, {10, 2771}, {10, 2769}, {10, 2767}, {10, 2765},
+ {10, 2763}, {10, 2761}, {10, 2759}, {10, 2757}, {10, 2755}, {10, 2753},
+ {10, 2751}, {10, 2749}, {10, 2747}, {10, 2745}, {10, 2743}, {10, 2741},
+ {10, 2739}, {10, 2737}, {10, 2735}, {10, 2733}, {10, 2731}, {10, 2729},
+ {10, 2727}, {10, 2725}, {10, 2723}, {10, 2721}, {10, 2719}, {10, 2717},
+ {10, 2715}, {10, 2713}, {10, 2711}, {10, 2709}, {10, 2707}, {10, 2705},
+ {10, 2703}, {10, 2701}, {10, 2699}, {10, 2697}, {10, 2695}, {10, 2693},
+ {10, 2691}, {10, 2689}, {10, 2687}, {10, 2685}, {10, 2683}, {10, 2681},
+ {10, 2679}, {10, 2677}, {10, 2675}, {10, 2673}, {10, 2671}, {10, 2669},
+ {10, 2667}, {10, 2665}, {10, 2663}, {10, 2661}, {10, 2659}, {10, 2657},
+ {10, 2655}, {10, 2653}, {10, 2651}, {10, 2649}, {10, 2647}, {10, 2645},
+ {10, 2643}, {10, 2641}, {10, 2639}, {10, 2637}, {10, 2635}, {10, 2633},
+ {10, 2631}, {10, 2629}, {10, 2627}, {10, 2625}, {10, 2623}, {10, 2621},
+ {10, 2619}, {10, 2617}, {10, 2615}, {10, 2613}, {10, 2611}, {10, 2609},
+ {10, 2607}, {10, 2605}, {10, 2603}, {10, 2601}, {10, 2599}, {10, 2597},
+ {10, 2595}, {10, 2593}, {10, 2591}, {10, 2589}, {10, 2587}, {10, 2585},
+ {10, 2583}, {10, 2581}, {10, 2579}, {10, 2577}, {10, 2575}, {10, 2573},
+ {10, 2571}, {10, 2569}, {10, 2567}, {10, 2565}, {10, 2563}, {10, 2561},
+ {10, 2559}, {10, 2557}, {10, 2555}, {10, 2553}, {10, 2551}, {10, 2549},
+ {10, 2547}, {10, 2545}, {10, 2543}, {10, 2541}, {10, 2539}, {10, 2537},
+ {10, 2535}, {10, 2533}, {10, 2531}, {10, 2529}, {10, 2527}, {10, 2525},
+ {10, 2523}, {10, 2521}, {10, 2519}, {10, 2517}, {10, 2515}, {10, 2513},
+ {10, 2511}, {10, 2509}, {10, 2507}, {10, 2505}, {10, 2503}, {10, 2501},
+ {10, 2499}, {10, 2497}, {10, 2495}, {10, 2493}, {10, 2491}, {10, 2489},
+ {10, 2487}, {10, 2485}, {10, 2483}, {10, 2481}, {10, 2479}, {10, 2477},
+ {10, 2475}, {10, 2473}, {10, 2471}, {10, 2469}, {10, 2467}, {10, 2465},
+ {10, 2463}, {10, 2461}, {10, 2459}, {10, 2457}, {10, 2455}, {10, 2453},
+ {10, 2451}, {10, 2449}, {10, 2447}, {10, 2445}, {10, 2443}, {10, 2441},
+ {10, 2439}, {10, 2437}, {10, 2435}, {10, 2433}, {10, 2431}, {10, 2429},
+ {10, 2427}, {10, 2425}, {10, 2423}, {10, 2421}, {10, 2419}, {10, 2417},
+ {10, 2415}, {10, 2413}, {10, 2411}, {10, 2409}, {10, 2407}, {10, 2405},
+ {10, 2403}, {10, 2401}, {10, 2399}, {10, 2397}, {10, 2395}, {10, 2393},
+ {10, 2391}, {10, 2389}, {10, 2387}, {10, 2385}, {10, 2383}, {10, 2381},
+ {10, 2379}, {10, 2377}, {10, 2375}, {10, 2373}, {10, 2371}, {10, 2369},
+ {10, 2367}, {10, 2365}, {10, 2363}, {10, 2361}, {10, 2359}, {10, 2357},
+ {10, 2355}, {10, 2353}, {10, 2351}, {10, 2349}, {10, 2347}, {10, 2345},
+ {10, 2343}, {10, 2341}, {10, 2339}, {10, 2337}, {10, 2335}, {10, 2333},
+ {10, 2331}, {10, 2329}, {10, 2327}, {10, 2325}, {10, 2323}, {10, 2321},
+ {10, 2319}, {10, 2317}, {10, 2315}, {10, 2313}, {10, 2311}, {10, 2309},
+ {10, 2307}, {10, 2305}, {10, 2303}, {10, 2301}, {10, 2299}, {10, 2297},
+ {10, 2295}, {10, 2293}, {10, 2291}, {10, 2289}, {10, 2287}, {10, 2285},
+ {10, 2283}, {10, 2281}, {10, 2279}, {10, 2277}, {10, 2275}, {10, 2273},
+ {10, 2271}, {10, 2269}, {10, 2267}, {10, 2265}, {10, 2263}, {10, 2261},
+ {10, 2259}, {10, 2257}, {10, 2255}, {10, 2253}, {10, 2251}, {10, 2249},
+ {10, 2247}, {10, 2245}, {10, 2243}, {10, 2241}, {10, 2239}, {10, 2237},
+ {10, 2235}, {10, 2233}, {10, 2231}, {10, 2229}, {10, 2227}, {10, 2225},
+ {10, 2223}, {10, 2221}, {10, 2219}, {10, 2217}, {10, 2215}, {10, 2213},
+ {10, 2211}, {10, 2209}, {10, 2207}, {10, 2205}, {10, 2203}, {10, 2201},
+ {10, 2199}, {10, 2197}, {10, 2195}, {10, 2193}, {10, 2191}, {10, 2189},
+ {10, 2187}, {10, 2185}, {10, 2183}, {10, 2181}, {10, 2179}, {10, 2177},
+ {10, 2175}, {10, 2173}, {10, 2171}, {10, 2169}, {10, 2167}, {10, 2165},
+ {10, 2163}, {10, 2161}, {10, 2159}, {10, 2157}, {10, 2155}, {10, 2153},
+ {10, 2151}, {10, 2149}, {10, 2147}, {10, 2145}, {10, 2143}, {10, 2141},
+ {10, 2139}, {10, 2137}, {10, 2135}, {10, 2133}, {10, 2131}, {10, 2129},
+ {10, 2127}, {10, 2125}, {10, 2123}, {10, 2121}, {10, 2119}, {10, 2117},
+ {10, 2115}, {10, 2113}, {10, 2111}, {10, 2109}, {10, 2107}, {10, 2105},
+ {10, 2103}, {10, 2101}, {10, 2099}, {10, 2097}, {10, 2095}, {10, 2093},
+ {10, 2091}, {10, 2089}, {10, 2087}, {10, 2085}, {10, 2083}, {10, 2081},
+ {10, 2079}, {10, 2077}, {10, 2075}, {10, 2073}, {10, 2071}, {10, 2069},
+ {10, 2067}, {10, 2065}, {10, 2063}, {10, 2061}, {10, 2059}, {10, 2057},
+ {10, 2055}, {10, 2053}, {10, 2051}, {10, 2049}, {10, 2047}, {10, 2045},
+ {10, 2043}, {10, 2041}, {10, 2039}, {10, 2037}, {10, 2035}, {10, 2033},
+ {10, 2031}, {10, 2029}, {10, 2027}, {10, 2025}, {10, 2023}, {10, 2021},
+ {10, 2019}, {10, 2017}, {10, 2015}, {10, 2013}, {10, 2011}, {10, 2009},
+ {10, 2007}, {10, 2005}, {10, 2003}, {10, 2001}, {10, 1999}, {10, 1997},
+ {10, 1995}, {10, 1993}, {10, 1991}, {10, 1989}, {10, 1987}, {10, 1985},
+ {10, 1983}, {10, 1981}, {10, 1979}, {10, 1977}, {10, 1975}, {10, 1973},
+ {10, 1971}, {10, 1969}, {10, 1967}, {10, 1965}, {10, 1963}, {10, 1961},
+ {10, 1959}, {10, 1957}, {10, 1955}, {10, 1953}, {10, 1951}, {10, 1949},
+ {10, 1947}, {10, 1945}, {10, 1943}, {10, 1941}, {10, 1939}, {10, 1937},
+ {10, 1935}, {10, 1933}, {10, 1931}, {10, 1929}, {10, 1927}, {10, 1925},
+ {10, 1923}, {10, 1921}, {10, 1919}, {10, 1917}, {10, 1915}, {10, 1913},
+ {10, 1911}, {10, 1909}, {10, 1907}, {10, 1905}, {10, 1903}, {10, 1901},
+ {10, 1899}, {10, 1897}, {10, 1895}, {10, 1893}, {10, 1891}, {10, 1889},
+ {10, 1887}, {10, 1885}, {10, 1883}, {10, 1881}, {10, 1879}, {10, 1877},
+ {10, 1875}, {10, 1873}, {10, 1871}, {10, 1869}, {10, 1867}, {10, 1865},
+ {10, 1863}, {10, 1861}, {10, 1859}, {10, 1857}, {10, 1855}, {10, 1853},
+ {10, 1851}, {10, 1849}, {10, 1847}, {10, 1845}, {10, 1843}, {10, 1841},
+ {10, 1839}, {10, 1837}, {10, 1835}, {10, 1833}, {10, 1831}, {10, 1829},
+ {10, 1827}, {10, 1825}, {10, 1823}, {10, 1821}, {10, 1819}, {10, 1817},
+ {10, 1815}, {10, 1813}, {10, 1811}, {10, 1809}, {10, 1807}, {10, 1805},
+ {10, 1803}, {10, 1801}, {10, 1799}, {10, 1797}, {10, 1795}, {10, 1793},
+ {10, 1791}, {10, 1789}, {10, 1787}, {10, 1785}, {10, 1783}, {10, 1781},
+ {10, 1779}, {10, 1777}, {10, 1775}, {10, 1773}, {10, 1771}, {10, 1769},
+ {10, 1767}, {10, 1765}, {10, 1763}, {10, 1761}, {10, 1759}, {10, 1757},
+ {10, 1755}, {10, 1753}, {10, 1751}, {10, 1749}, {10, 1747}, {10, 1745},
+ {10, 1743}, {10, 1741}, {10, 1739}, {10, 1737}, {10, 1735}, {10, 1733},
+ {10, 1731}, {10, 1729}, {10, 1727}, {10, 1725}, {10, 1723}, {10, 1721},
+ {10, 1719}, {10, 1717}, {10, 1715}, {10, 1713}, {10, 1711}, {10, 1709},
+ {10, 1707}, {10, 1705}, {10, 1703}, {10, 1701}, {10, 1699}, {10, 1697},
+ {10, 1695}, {10, 1693}, {10, 1691}, {10, 1689}, {10, 1687}, {10, 1685},
+ {10, 1683}, {10, 1681}, {10, 1679}, {10, 1677}, {10, 1675}, {10, 1673},
+ {10, 1671}, {10, 1669}, {10, 1667}, {10, 1665}, {10, 1663}, {10, 1661},
+ {10, 1659}, {10, 1657}, {10, 1655}, {10, 1653}, {10, 1651}, {10, 1649},
+ {10, 1647}, {10, 1645}, {10, 1643}, {10, 1641}, {10, 1639}, {10, 1637},
+ {10, 1635}, {10, 1633}, {10, 1631}, {10, 1629}, {10, 1627}, {10, 1625},
+ {10, 1623}, {10, 1621}, {10, 1619}, {10, 1617}, {10, 1615}, {10, 1613},
+ {10, 1611}, {10, 1609}, {10, 1607}, {10, 1605}, {10, 1603}, {10, 1601},
+ {10, 1599}, {10, 1597}, {10, 1595}, {10, 1593}, {10, 1591}, {10, 1589},
+ {10, 1587}, {10, 1585}, {10, 1583}, {10, 1581}, {10, 1579}, {10, 1577},
+ {10, 1575}, {10, 1573}, {10, 1571}, {10, 1569}, {10, 1567}, {10, 1565},
+ {10, 1563}, {10, 1561}, {10, 1559}, {10, 1557}, {10, 1555}, {10, 1553},
+ {10, 1551}, {10, 1549}, {10, 1547}, {10, 1545}, {10, 1543}, {10, 1541},
+ {10, 1539}, {10, 1537}, {10, 1535}, {10, 1533}, {10, 1531}, {10, 1529},
+ {10, 1527}, {10, 1525}, {10, 1523}, {10, 1521}, {10, 1519}, {10, 1517},
+ {10, 1515}, {10, 1513}, {10, 1511}, {10, 1509}, {10, 1507}, {10, 1505},
+ {10, 1503}, {10, 1501}, {10, 1499}, {10, 1497}, {10, 1495}, {10, 1493},
+ {10, 1491}, {10, 1489}, {10, 1487}, {10, 1485}, {10, 1483}, {10, 1481},
+ {10, 1479}, {10, 1477}, {10, 1475}, {10, 1473}, {10, 1471}, {10, 1469},
+ {10, 1467}, {10, 1465}, {10, 1463}, {10, 1461}, {10, 1459}, {10, 1457},
+ {10, 1455}, {10, 1453}, {10, 1451}, {10, 1449}, {10, 1447}, {10, 1445},
+ {10, 1443}, {10, 1441}, {10, 1439}, {10, 1437}, {10, 1435}, {10, 1433},
+ {10, 1431}, {10, 1429}, {10, 1427}, {10, 1425}, {10, 1423}, {10, 1421},
+ {10, 1419}, {10, 1417}, {10, 1415}, {10, 1413}, {10, 1411}, {10, 1409},
+ {10, 1407}, {10, 1405}, {10, 1403}, {10, 1401}, {10, 1399}, {10, 1397},
+ {10, 1395}, {10, 1393}, {10, 1391}, {10, 1389}, {10, 1387}, {10, 1385},
+ {10, 1383}, {10, 1381}, {10, 1379}, {10, 1377}, {10, 1375}, {10, 1373},
+ {10, 1371}, {10, 1369}, {10, 1367}, {10, 1365}, {10, 1363}, {10, 1361},
+ {10, 1359}, {10, 1357}, {10, 1355}, {10, 1353}, {10, 1351}, {10, 1349},
+ {10, 1347}, {10, 1345}, {10, 1343}, {10, 1341}, {10, 1339}, {10, 1337},
+ {10, 1335}, {10, 1333}, {10, 1331}, {10, 1329}, {10, 1327}, {10, 1325},
+ {10, 1323}, {10, 1321}, {10, 1319}, {10, 1317}, {10, 1315}, {10, 1313},
+ {10, 1311}, {10, 1309}, {10, 1307}, {10, 1305}, {10, 1303}, {10, 1301},
+ {10, 1299}, {10, 1297}, {10, 1295}, {10, 1293}, {10, 1291}, {10, 1289},
+ {10, 1287}, {10, 1285}, {10, 1283}, {10, 1281}, {10, 1279}, {10, 1277},
+ {10, 1275}, {10, 1273}, {10, 1271}, {10, 1269}, {10, 1267}, {10, 1265},
+ {10, 1263}, {10, 1261}, {10, 1259}, {10, 1257}, {10, 1255}, {10, 1253},
+ {10, 1251}, {10, 1249}, {10, 1247}, {10, 1245}, {10, 1243}, {10, 1241},
+ {10, 1239}, {10, 1237}, {10, 1235}, {10, 1233}, {10, 1231}, {10, 1229},
+ {10, 1227}, {10, 1225}, {10, 1223}, {10, 1221}, {10, 1219}, {10, 1217},
+ {10, 1215}, {10, 1213}, {10, 1211}, {10, 1209}, {10, 1207}, {10, 1205},
+ {10, 1203}, {10, 1201}, {10, 1199}, {10, 1197}, {10, 1195}, {10, 1193},
+ {10, 1191}, {10, 1189}, {10, 1187}, {10, 1185}, {10, 1183}, {10, 1181},
+ {10, 1179}, {10, 1177}, {10, 1175}, {10, 1173}, {10, 1171}, {10, 1169},
+ {10, 1167}, {10, 1165}, {10, 1163}, {10, 1161}, {10, 1159}, {10, 1157},
+ {10, 1155}, {10, 1153}, {10, 1151}, {10, 1149}, {10, 1147}, {10, 1145},
+ {10, 1143}, {10, 1141}, {10, 1139}, {10, 1137}, {10, 1135}, {10, 1133},
+ {10, 1131}, {10, 1129}, {10, 1127}, {10, 1125}, {10, 1123}, {10, 1121},
+ {10, 1119}, {10, 1117}, {10, 1115}, {10, 1113}, {10, 1111}, {10, 1109},
+ {10, 1107}, {10, 1105}, {10, 1103}, {10, 1101}, {10, 1099}, {10, 1097},
+ {10, 1095}, {10, 1093}, {10, 1091}, {10, 1089}, {10, 1087}, {10, 1085},
+ {10, 1083}, {10, 1081}, {10, 1079}, {10, 1077}, {10, 1075}, {10, 1073},
+ {10, 1071}, {10, 1069}, {10, 1067}, {10, 1065}, {10, 1063}, {10, 1061},
+ {10, 1059}, {10, 1057}, {10, 1055}, {10, 1053}, {10, 1051}, {10, 1049},
+ {10, 1047}, {10, 1045}, {10, 1043}, {10, 1041}, {10, 1039}, {10, 1037},
+ {10, 1035}, {10, 1033}, {10, 1031}, {10, 1029}, {10, 1027}, {10, 1025},
+ {10, 1023}, {10, 1021}, {10, 1019}, {10, 1017}, {10, 1015}, {10, 1013},
+ {10, 1011}, {10, 1009}, {10, 1007}, {10, 1005}, {10, 1003}, {10, 1001},
+ {10, 999}, {10, 997}, {10, 995}, {10, 993}, {10, 991}, {10, 989},
+ {10, 987}, {10, 985}, {10, 983}, {10, 981}, {10, 979}, {10, 977},
+ {10, 975}, {10, 973}, {10, 971}, {10, 969}, {10, 967}, {10, 965},
+ {10, 963}, {10, 961}, {10, 959}, {10, 957}, {10, 955}, {10, 953},
+ {10, 951}, {10, 949}, {10, 947}, {10, 945}, {10, 943}, {10, 941},
+ {10, 939}, {10, 937}, {10, 935}, {10, 933}, {10, 931}, {10, 929},
+ {10, 927}, {10, 925}, {10, 923}, {10, 921}, {10, 919}, {10, 917},
+ {10, 915}, {10, 913}, {10, 911}, {10, 909}, {10, 907}, {10, 905},
+ {10, 903}, {10, 901}, {10, 899}, {10, 897}, {10, 895}, {10, 893},
+ {10, 891}, {10, 889}, {10, 887}, {10, 885}, {10, 883}, {10, 881},
+ {10, 879}, {10, 877}, {10, 875}, {10, 873}, {10, 871}, {10, 869},
+ {10, 867}, {10, 865}, {10, 863}, {10, 861}, {10, 859}, {10, 857},
+ {10, 855}, {10, 853}, {10, 851}, {10, 849}, {10, 847}, {10, 845},
+ {10, 843}, {10, 841}, {10, 839}, {10, 837}, {10, 835}, {10, 833},
+ {10, 831}, {10, 829}, {10, 827}, {10, 825}, {10, 823}, {10, 821},
+ {10, 819}, {10, 817}, {10, 815}, {10, 813}, {10, 811}, {10, 809},
+ {10, 807}, {10, 805}, {10, 803}, {10, 801}, {10, 799}, {10, 797},
+ {10, 795}, {10, 793}, {10, 791}, {10, 789}, {10, 787}, {10, 785},
+ {10, 783}, {10, 781}, {10, 779}, {10, 777}, {10, 775}, {10, 773},
+ {10, 771}, {10, 769}, {10, 767}, {10, 765}, {10, 763}, {10, 761},
+ {10, 759}, {10, 757}, {10, 755}, {10, 753}, {10, 751}, {10, 749},
+ {10, 747}, {10, 745}, {10, 743}, {10, 741}, {10, 739}, {10, 737},
+ {10, 735}, {10, 733}, {10, 731}, {10, 729}, {10, 727}, {10, 725},
+ {10, 723}, {10, 721}, {10, 719}, {10, 717}, {10, 715}, {10, 713},
+ {10, 711}, {10, 709}, {10, 707}, {10, 705}, {10, 703}, {10, 701},
+ {10, 699}, {10, 697}, {10, 695}, {10, 693}, {10, 691}, {10, 689},
+ {10, 687}, {10, 685}, {10, 683}, {10, 681}, {10, 679}, {10, 677},
+ {10, 675}, {10, 673}, {10, 671}, {10, 669}, {10, 667}, {10, 665},
+ {10, 663}, {10, 661}, {10, 659}, {10, 657}, {10, 655}, {10, 653},
+ {10, 651}, {10, 649}, {10, 647}, {10, 645}, {10, 643}, {10, 641},
+ {10, 639}, {10, 637}, {10, 635}, {10, 633}, {10, 631}, {10, 629},
+ {10, 627}, {10, 625}, {10, 623}, {10, 621}, {10, 619}, {10, 617},
+ {10, 615}, {10, 613}, {10, 611}, {10, 609}, {10, 607}, {10, 605},
+ {10, 603}, {10, 601}, {10, 599}, {10, 597}, {10, 595}, {10, 593},
+ {10, 591}, {10, 589}, {10, 587}, {10, 585}, {10, 583}, {10, 581},
+ {10, 579}, {10, 577}, {10, 575}, {10, 573}, {10, 571}, {10, 569},
+ {10, 567}, {10, 565}, {10, 563}, {10, 561}, {10, 559}, {10, 557},
+ {10, 555}, {10, 553}, {10, 551}, {10, 549}, {10, 547}, {10, 545},
+ {10, 543}, {10, 541}, {10, 539}, {10, 537}, {10, 535}, {10, 533},
+ {10, 531}, {10, 529}, {10, 527}, {10, 525}, {10, 523}, {10, 521},
+ {10, 519}, {10, 517}, {10, 515}, {10, 513}, {10, 511}, {10, 509},
+ {10, 507}, {10, 505}, {10, 503}, {10, 501}, {10, 499}, {10, 497},
+ {10, 495}, {10, 493}, {10, 491}, {10, 489}, {10, 487}, {10, 485},
+ {10, 483}, {10, 481}, {10, 479}, {10, 477}, {10, 475}, {10, 473},
+ {10, 471}, {10, 469}, {10, 467}, {10, 465}, {10, 463}, {10, 461},
+ {10, 459}, {10, 457}, {10, 455}, {10, 453}, {10, 451}, {10, 449},
+ {10, 447}, {10, 445}, {10, 443}, {10, 441}, {10, 439}, {10, 437},
+ {10, 435}, {10, 433}, {10, 431}, {10, 429}, {10, 427}, {10, 425},
+ {10, 423}, {10, 421}, {10, 419}, {10, 417}, {10, 415}, {10, 413},
+ {10, 411}, {10, 409}, {10, 407}, {10, 405}, {10, 403}, {10, 401},
+ {10, 399}, {10, 397}, {10, 395}, {10, 393}, {10, 391}, {10, 389},
+ {10, 387}, {10, 385}, {10, 383}, {10, 381}, {10, 379}, {10, 377},
+ {10, 375}, {10, 373}, {10, 371}, {10, 369}, {10, 367}, {10, 365},
+ {10, 363}, {10, 361}, {10, 359}, {10, 357}, {10, 355}, {10, 353},
+ {10, 351}, {10, 349}, {10, 347}, {10, 345}, {10, 343}, {10, 341},
+ {10, 339}, {10, 337}, {10, 335}, {10, 333}, {10, 331}, {10, 329},
+ {10, 327}, {10, 325}, {10, 323}, {10, 321}, {10, 319}, {10, 317},
+ {10, 315}, {10, 313}, {10, 311}, {10, 309}, {10, 307}, {10, 305},
+ {10, 303}, {10, 301}, {10, 299}, {10, 297}, {10, 295}, {10, 293},
+ {10, 291}, {10, 289}, {10, 287}, {10, 285}, {10, 283}, {10, 281},
+ {10, 279}, {10, 277}, {10, 275}, {10, 273}, {10, 271}, {10, 269},
+ {10, 267}, {10, 265}, {10, 263}, {10, 261}, {10, 259}, {10, 257},
+ {10, 255}, {10, 253}, {10, 251}, {10, 249}, {10, 247}, {10, 245},
+ {10, 243}, {10, 241}, {10, 239}, {10, 237}, {10, 235}, {10, 233},
+ {10, 231}, {10, 229}, {10, 227}, {10, 225}, {10, 223}, {10, 221},
+ {10, 219}, {10, 217}, {10, 215}, {10, 213}, {10, 211}, {10, 209},
+ {10, 207}, {10, 205}, {10, 203}, {10, 201}, {10, 199}, {10, 197},
+ {10, 195}, {10, 193}, {10, 191}, {10, 189}, {10, 187}, {10, 185},
+ {10, 183}, {10, 181}, {10, 179}, {10, 177}, {10, 175}, {10, 173},
+ {10, 171}, {10, 169}, {10, 167}, {10, 165}, {10, 163}, {10, 161},
+ {10, 159}, {10, 157}, {10, 155}, {10, 153}, {10, 151}, {10, 149},
+ {10, 147}, {10, 145}, {10, 143}, {10, 141}, {10, 139}, {10, 137},
+ {10, 135}, {10, 133}, {10, 131}, {10, 129}, {10, 127}, {10, 125},
+ {10, 123}, {10, 121}, {10, 119}, {10, 117}, {10, 115}, {10, 113},
+ {10, 111}, {10, 109}, {10, 107}, {10, 105}, {10, 103}, {10, 101},
+ {10, 99}, {10, 97}, {10, 95}, {10, 93}, {10, 91}, {10, 89},
+ {10, 87}, {10, 85}, {10, 83}, {10, 81}, {10, 79}, {10, 77},
+ {10, 75}, {10, 73}, {10, 71}, {10, 69}, {10, 67}, {10, 65},
+ {10, 63}, {10, 61}, {10, 59}, {10, 57}, {10, 55}, {10, 53},
+ {10, 51}, {10, 49}, {10, 47}, {10, 45}, {10, 43}, {10, 41},
+ {10, 39}, {10, 37}, {10, 35}, {10, 33}, {10, 31}, {10, 29},
+ {10, 27}, {10, 25}, {10, 23}, {10, 21}, {10, 19}, {10, 17},
+ {10, 15}, {10, 13}, {10, 11}, {10, 9}, {10, 7}, {10, 5},
+ {10, 3}, {10, 1}, {9, 63}, {9, 61}, {9, 59}, {9, 57},
+ {9, 55}, {9, 53}, {9, 51}, {9, 49}, {9, 47}, {9, 45},
+ {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+ {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21},
+ {9, 19}, {9, 17}, {9, 15}, {9, 13}, {9, 11}, {9, 9},
+ {9, 7}, {9, 5}, {9, 3}, {9, 1}, {8, 31}, {8, 29},
+ {8, 27}, {8, 25}, {8, 23}, {8, 21}, {8, 19}, {8, 17},
+ {8, 15}, {8, 13}, {8, 11}, {8, 9}, {8, 7}, {8, 5},
+ {8, 3}, {8, 1}, {7, 15}, {7, 13}, {7, 11}, {7, 9},
+ {7, 7}, {7, 5}, {7, 3}, {7, 1}, {6, 7}, {6, 5},
+ {6, 3}, {6, 1}, {5, 3}, {5, 1}, {4, 1}, {3, 1},
+ {2, 1}, {1, 1}, {0, 0}, {1, 0}, {2, 0}, {3, 0},
+ {4, 0}, {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4},
+ {6, 6}, {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8},
+ {7, 10}, {7, 12}, {7, 14}, {8, 0}, {8, 2}, {8, 4},
+ {8, 6}, {8, 8}, {8, 10}, {8, 12}, {8, 14}, {8, 16},
+ {8, 18}, {8, 20}, {8, 22}, {8, 24}, {8, 26}, {8, 28},
+ {8, 30}, {9, 0}, {9, 2}, {9, 4}, {9, 6}, {9, 8},
+ {9, 10}, {9, 12}, {9, 14}, {9, 16}, {9, 18}, {9, 20},
+ {9, 22}, {9, 24}, {9, 26}, {9, 28}, {9, 30}, {9, 32},
+ {9, 34}, {9, 36}, {9, 38}, {9, 40}, {9, 42}, {9, 44},
+ {9, 46}, {9, 48}, {9, 50}, {9, 52}, {9, 54}, {9, 56},
+ {9, 58}, {9, 60}, {9, 62}, {10, 0}, {10, 2}, {10, 4},
+ {10, 6}, {10, 8}, {10, 10}, {10, 12}, {10, 14}, {10, 16},
+ {10, 18}, {10, 20}, {10, 22}, {10, 24}, {10, 26}, {10, 28},
+ {10, 30}, {10, 32}, {10, 34}, {10, 36}, {10, 38}, {10, 40},
+ {10, 42}, {10, 44}, {10, 46}, {10, 48}, {10, 50}, {10, 52},
+ {10, 54}, {10, 56}, {10, 58}, {10, 60}, {10, 62}, {10, 64},
+ {10, 66}, {10, 68}, {10, 70}, {10, 72}, {10, 74}, {10, 76},
+ {10, 78}, {10, 80}, {10, 82}, {10, 84}, {10, 86}, {10, 88},
+ {10, 90}, {10, 92}, {10, 94}, {10, 96}, {10, 98}, {10, 100},
+ {10, 102}, {10, 104}, {10, 106}, {10, 108}, {10, 110}, {10, 112},
+ {10, 114}, {10, 116}, {10, 118}, {10, 120}, {10, 122}, {10, 124},
+ {10, 126}, {10, 128}, {10, 130}, {10, 132}, {10, 134}, {10, 136},
+ {10, 138}, {10, 140}, {10, 142}, {10, 144}, {10, 146}, {10, 148},
+ {10, 150}, {10, 152}, {10, 154}, {10, 156}, {10, 158}, {10, 160},
+ {10, 162}, {10, 164}, {10, 166}, {10, 168}, {10, 170}, {10, 172},
+ {10, 174}, {10, 176}, {10, 178}, {10, 180}, {10, 182}, {10, 184},
+ {10, 186}, {10, 188}, {10, 190}, {10, 192}, {10, 194}, {10, 196},
+ {10, 198}, {10, 200}, {10, 202}, {10, 204}, {10, 206}, {10, 208},
+ {10, 210}, {10, 212}, {10, 214}, {10, 216}, {10, 218}, {10, 220},
+ {10, 222}, {10, 224}, {10, 226}, {10, 228}, {10, 230}, {10, 232},
+ {10, 234}, {10, 236}, {10, 238}, {10, 240}, {10, 242}, {10, 244},
+ {10, 246}, {10, 248}, {10, 250}, {10, 252}, {10, 254}, {10, 256},
+ {10, 258}, {10, 260}, {10, 262}, {10, 264}, {10, 266}, {10, 268},
+ {10, 270}, {10, 272}, {10, 274}, {10, 276}, {10, 278}, {10, 280},
+ {10, 282}, {10, 284}, {10, 286}, {10, 288}, {10, 290}, {10, 292},
+ {10, 294}, {10, 296}, {10, 298}, {10, 300}, {10, 302}, {10, 304},
+ {10, 306}, {10, 308}, {10, 310}, {10, 312}, {10, 314}, {10, 316},
+ {10, 318}, {10, 320}, {10, 322}, {10, 324}, {10, 326}, {10, 328},
+ {10, 330}, {10, 332}, {10, 334}, {10, 336}, {10, 338}, {10, 340},
+ {10, 342}, {10, 344}, {10, 346}, {10, 348}, {10, 350}, {10, 352},
+ {10, 354}, {10, 356}, {10, 358}, {10, 360}, {10, 362}, {10, 364},
+ {10, 366}, {10, 368}, {10, 370}, {10, 372}, {10, 374}, {10, 376},
+ {10, 378}, {10, 380}, {10, 382}, {10, 384}, {10, 386}, {10, 388},
+ {10, 390}, {10, 392}, {10, 394}, {10, 396}, {10, 398}, {10, 400},
+ {10, 402}, {10, 404}, {10, 406}, {10, 408}, {10, 410}, {10, 412},
+ {10, 414}, {10, 416}, {10, 418}, {10, 420}, {10, 422}, {10, 424},
+ {10, 426}, {10, 428}, {10, 430}, {10, 432}, {10, 434}, {10, 436},
+ {10, 438}, {10, 440}, {10, 442}, {10, 444}, {10, 446}, {10, 448},
+ {10, 450}, {10, 452}, {10, 454}, {10, 456}, {10, 458}, {10, 460},
+ {10, 462}, {10, 464}, {10, 466}, {10, 468}, {10, 470}, {10, 472},
+ {10, 474}, {10, 476}, {10, 478}, {10, 480}, {10, 482}, {10, 484},
+ {10, 486}, {10, 488}, {10, 490}, {10, 492}, {10, 494}, {10, 496},
+ {10, 498}, {10, 500}, {10, 502}, {10, 504}, {10, 506}, {10, 508},
+ {10, 510}, {10, 512}, {10, 514}, {10, 516}, {10, 518}, {10, 520},
+ {10, 522}, {10, 524}, {10, 526}, {10, 528}, {10, 530}, {10, 532},
+ {10, 534}, {10, 536}, {10, 538}, {10, 540}, {10, 542}, {10, 544},
+ {10, 546}, {10, 548}, {10, 550}, {10, 552}, {10, 554}, {10, 556},
+ {10, 558}, {10, 560}, {10, 562}, {10, 564}, {10, 566}, {10, 568},
+ {10, 570}, {10, 572}, {10, 574}, {10, 576}, {10, 578}, {10, 580},
+ {10, 582}, {10, 584}, {10, 586}, {10, 588}, {10, 590}, {10, 592},
+ {10, 594}, {10, 596}, {10, 598}, {10, 600}, {10, 602}, {10, 604},
+ {10, 606}, {10, 608}, {10, 610}, {10, 612}, {10, 614}, {10, 616},
+ {10, 618}, {10, 620}, {10, 622}, {10, 624}, {10, 626}, {10, 628},
+ {10, 630}, {10, 632}, {10, 634}, {10, 636}, {10, 638}, {10, 640},
+ {10, 642}, {10, 644}, {10, 646}, {10, 648}, {10, 650}, {10, 652},
+ {10, 654}, {10, 656}, {10, 658}, {10, 660}, {10, 662}, {10, 664},
+ {10, 666}, {10, 668}, {10, 670}, {10, 672}, {10, 674}, {10, 676},
+ {10, 678}, {10, 680}, {10, 682}, {10, 684}, {10, 686}, {10, 688},
+ {10, 690}, {10, 692}, {10, 694}, {10, 696}, {10, 698}, {10, 700},
+ {10, 702}, {10, 704}, {10, 706}, {10, 708}, {10, 710}, {10, 712},
+ {10, 714}, {10, 716}, {10, 718}, {10, 720}, {10, 722}, {10, 724},
+ {10, 726}, {10, 728}, {10, 730}, {10, 732}, {10, 734}, {10, 736},
+ {10, 738}, {10, 740}, {10, 742}, {10, 744}, {10, 746}, {10, 748},
+ {10, 750}, {10, 752}, {10, 754}, {10, 756}, {10, 758}, {10, 760},
+ {10, 762}, {10, 764}, {10, 766}, {10, 768}, {10, 770}, {10, 772},
+ {10, 774}, {10, 776}, {10, 778}, {10, 780}, {10, 782}, {10, 784},
+ {10, 786}, {10, 788}, {10, 790}, {10, 792}, {10, 794}, {10, 796},
+ {10, 798}, {10, 800}, {10, 802}, {10, 804}, {10, 806}, {10, 808},
+ {10, 810}, {10, 812}, {10, 814}, {10, 816}, {10, 818}, {10, 820},
+ {10, 822}, {10, 824}, {10, 826}, {10, 828}, {10, 830}, {10, 832},
+ {10, 834}, {10, 836}, {10, 838}, {10, 840}, {10, 842}, {10, 844},
+ {10, 846}, {10, 848}, {10, 850}, {10, 852}, {10, 854}, {10, 856},
+ {10, 858}, {10, 860}, {10, 862}, {10, 864}, {10, 866}, {10, 868},
+ {10, 870}, {10, 872}, {10, 874}, {10, 876}, {10, 878}, {10, 880},
+ {10, 882}, {10, 884}, {10, 886}, {10, 888}, {10, 890}, {10, 892},
+ {10, 894}, {10, 896}, {10, 898}, {10, 900}, {10, 902}, {10, 904},
+ {10, 906}, {10, 908}, {10, 910}, {10, 912}, {10, 914}, {10, 916},
+ {10, 918}, {10, 920}, {10, 922}, {10, 924}, {10, 926}, {10, 928},
+ {10, 930}, {10, 932}, {10, 934}, {10, 936}, {10, 938}, {10, 940},
+ {10, 942}, {10, 944}, {10, 946}, {10, 948}, {10, 950}, {10, 952},
+ {10, 954}, {10, 956}, {10, 958}, {10, 960}, {10, 962}, {10, 964},
+ {10, 966}, {10, 968}, {10, 970}, {10, 972}, {10, 974}, {10, 976},
+ {10, 978}, {10, 980}, {10, 982}, {10, 984}, {10, 986}, {10, 988},
+ {10, 990}, {10, 992}, {10, 994}, {10, 996}, {10, 998}, {10, 1000},
+ {10, 1002}, {10, 1004}, {10, 1006}, {10, 1008}, {10, 1010}, {10, 1012},
+ {10, 1014}, {10, 1016}, {10, 1018}, {10, 1020}, {10, 1022}, {10, 1024},
+ {10, 1026}, {10, 1028}, {10, 1030}, {10, 1032}, {10, 1034}, {10, 1036},
+ {10, 1038}, {10, 1040}, {10, 1042}, {10, 1044}, {10, 1046}, {10, 1048},
+ {10, 1050}, {10, 1052}, {10, 1054}, {10, 1056}, {10, 1058}, {10, 1060},
+ {10, 1062}, {10, 1064}, {10, 1066}, {10, 1068}, {10, 1070}, {10, 1072},
+ {10, 1074}, {10, 1076}, {10, 1078}, {10, 1080}, {10, 1082}, {10, 1084},
+ {10, 1086}, {10, 1088}, {10, 1090}, {10, 1092}, {10, 1094}, {10, 1096},
+ {10, 1098}, {10, 1100}, {10, 1102}, {10, 1104}, {10, 1106}, {10, 1108},
+ {10, 1110}, {10, 1112}, {10, 1114}, {10, 1116}, {10, 1118}, {10, 1120},
+ {10, 1122}, {10, 1124}, {10, 1126}, {10, 1128}, {10, 1130}, {10, 1132},
+ {10, 1134}, {10, 1136}, {10, 1138}, {10, 1140}, {10, 1142}, {10, 1144},
+ {10, 1146}, {10, 1148}, {10, 1150}, {10, 1152}, {10, 1154}, {10, 1156},
+ {10, 1158}, {10, 1160}, {10, 1162}, {10, 1164}, {10, 1166}, {10, 1168},
+ {10, 1170}, {10, 1172}, {10, 1174}, {10, 1176}, {10, 1178}, {10, 1180},
+ {10, 1182}, {10, 1184}, {10, 1186}, {10, 1188}, {10, 1190}, {10, 1192},
+ {10, 1194}, {10, 1196}, {10, 1198}, {10, 1200}, {10, 1202}, {10, 1204},
+ {10, 1206}, {10, 1208}, {10, 1210}, {10, 1212}, {10, 1214}, {10, 1216},
+ {10, 1218}, {10, 1220}, {10, 1222}, {10, 1224}, {10, 1226}, {10, 1228},
+ {10, 1230}, {10, 1232}, {10, 1234}, {10, 1236}, {10, 1238}, {10, 1240},
+ {10, 1242}, {10, 1244}, {10, 1246}, {10, 1248}, {10, 1250}, {10, 1252},
+ {10, 1254}, {10, 1256}, {10, 1258}, {10, 1260}, {10, 1262}, {10, 1264},
+ {10, 1266}, {10, 1268}, {10, 1270}, {10, 1272}, {10, 1274}, {10, 1276},
+ {10, 1278}, {10, 1280}, {10, 1282}, {10, 1284}, {10, 1286}, {10, 1288},
+ {10, 1290}, {10, 1292}, {10, 1294}, {10, 1296}, {10, 1298}, {10, 1300},
+ {10, 1302}, {10, 1304}, {10, 1306}, {10, 1308}, {10, 1310}, {10, 1312},
+ {10, 1314}, {10, 1316}, {10, 1318}, {10, 1320}, {10, 1322}, {10, 1324},
+ {10, 1326}, {10, 1328}, {10, 1330}, {10, 1332}, {10, 1334}, {10, 1336},
+ {10, 1338}, {10, 1340}, {10, 1342}, {10, 1344}, {10, 1346}, {10, 1348},
+ {10, 1350}, {10, 1352}, {10, 1354}, {10, 1356}, {10, 1358}, {10, 1360},
+ {10, 1362}, {10, 1364}, {10, 1366}, {10, 1368}, {10, 1370}, {10, 1372},
+ {10, 1374}, {10, 1376}, {10, 1378}, {10, 1380}, {10, 1382}, {10, 1384},
+ {10, 1386}, {10, 1388}, {10, 1390}, {10, 1392}, {10, 1394}, {10, 1396},
+ {10, 1398}, {10, 1400}, {10, 1402}, {10, 1404}, {10, 1406}, {10, 1408},
+ {10, 1410}, {10, 1412}, {10, 1414}, {10, 1416}, {10, 1418}, {10, 1420},
+ {10, 1422}, {10, 1424}, {10, 1426}, {10, 1428}, {10, 1430}, {10, 1432},
+ {10, 1434}, {10, 1436}, {10, 1438}, {10, 1440}, {10, 1442}, {10, 1444},
+ {10, 1446}, {10, 1448}, {10, 1450}, {10, 1452}, {10, 1454}, {10, 1456},
+ {10, 1458}, {10, 1460}, {10, 1462}, {10, 1464}, {10, 1466}, {10, 1468},
+ {10, 1470}, {10, 1472}, {10, 1474}, {10, 1476}, {10, 1478}, {10, 1480},
+ {10, 1482}, {10, 1484}, {10, 1486}, {10, 1488}, {10, 1490}, {10, 1492},
+ {10, 1494}, {10, 1496}, {10, 1498}, {10, 1500}, {10, 1502}, {10, 1504},
+ {10, 1506}, {10, 1508}, {10, 1510}, {10, 1512}, {10, 1514}, {10, 1516},
+ {10, 1518}, {10, 1520}, {10, 1522}, {10, 1524}, {10, 1526}, {10, 1528},
+ {10, 1530}, {10, 1532}, {10, 1534}, {10, 1536}, {10, 1538}, {10, 1540},
+ {10, 1542}, {10, 1544}, {10, 1546}, {10, 1548}, {10, 1550}, {10, 1552},
+ {10, 1554}, {10, 1556}, {10, 1558}, {10, 1560}, {10, 1562}, {10, 1564},
+ {10, 1566}, {10, 1568}, {10, 1570}, {10, 1572}, {10, 1574}, {10, 1576},
+ {10, 1578}, {10, 1580}, {10, 1582}, {10, 1584}, {10, 1586}, {10, 1588},
+ {10, 1590}, {10, 1592}, {10, 1594}, {10, 1596}, {10, 1598}, {10, 1600},
+ {10, 1602}, {10, 1604}, {10, 1606}, {10, 1608}, {10, 1610}, {10, 1612},
+ {10, 1614}, {10, 1616}, {10, 1618}, {10, 1620}, {10, 1622}, {10, 1624},
+ {10, 1626}, {10, 1628}, {10, 1630}, {10, 1632}, {10, 1634}, {10, 1636},
+ {10, 1638}, {10, 1640}, {10, 1642}, {10, 1644}, {10, 1646}, {10, 1648},
+ {10, 1650}, {10, 1652}, {10, 1654}, {10, 1656}, {10, 1658}, {10, 1660},
+ {10, 1662}, {10, 1664}, {10, 1666}, {10, 1668}, {10, 1670}, {10, 1672},
+ {10, 1674}, {10, 1676}, {10, 1678}, {10, 1680}, {10, 1682}, {10, 1684},
+ {10, 1686}, {10, 1688}, {10, 1690}, {10, 1692}, {10, 1694}, {10, 1696},
+ {10, 1698}, {10, 1700}, {10, 1702}, {10, 1704}, {10, 1706}, {10, 1708},
+ {10, 1710}, {10, 1712}, {10, 1714}, {10, 1716}, {10, 1718}, {10, 1720},
+ {10, 1722}, {10, 1724}, {10, 1726}, {10, 1728}, {10, 1730}, {10, 1732},
+ {10, 1734}, {10, 1736}, {10, 1738}, {10, 1740}, {10, 1742}, {10, 1744},
+ {10, 1746}, {10, 1748}, {10, 1750}, {10, 1752}, {10, 1754}, {10, 1756},
+ {10, 1758}, {10, 1760}, {10, 1762}, {10, 1764}, {10, 1766}, {10, 1768},
+ {10, 1770}, {10, 1772}, {10, 1774}, {10, 1776}, {10, 1778}, {10, 1780},
+ {10, 1782}, {10, 1784}, {10, 1786}, {10, 1788}, {10, 1790}, {10, 1792},
+ {10, 1794}, {10, 1796}, {10, 1798}, {10, 1800}, {10, 1802}, {10, 1804},
+ {10, 1806}, {10, 1808}, {10, 1810}, {10, 1812}, {10, 1814}, {10, 1816},
+ {10, 1818}, {10, 1820}, {10, 1822}, {10, 1824}, {10, 1826}, {10, 1828},
+ {10, 1830}, {10, 1832}, {10, 1834}, {10, 1836}, {10, 1838}, {10, 1840},
+ {10, 1842}, {10, 1844}, {10, 1846}, {10, 1848}, {10, 1850}, {10, 1852},
+ {10, 1854}, {10, 1856}, {10, 1858}, {10, 1860}, {10, 1862}, {10, 1864},
+ {10, 1866}, {10, 1868}, {10, 1870}, {10, 1872}, {10, 1874}, {10, 1876},
+ {10, 1878}, {10, 1880}, {10, 1882}, {10, 1884}, {10, 1886}, {10, 1888},
+ {10, 1890}, {10, 1892}, {10, 1894}, {10, 1896}, {10, 1898}, {10, 1900},
+ {10, 1902}, {10, 1904}, {10, 1906}, {10, 1908}, {10, 1910}, {10, 1912},
+ {10, 1914}, {10, 1916}, {10, 1918}, {10, 1920}, {10, 1922}, {10, 1924},
+ {10, 1926}, {10, 1928}, {10, 1930}, {10, 1932}, {10, 1934}, {10, 1936},
+ {10, 1938}, {10, 1940}, {10, 1942}, {10, 1944}, {10, 1946}, {10, 1948},
+ {10, 1950}, {10, 1952}, {10, 1954}, {10, 1956}, {10, 1958}, {10, 1960},
+ {10, 1962}, {10, 1964}, {10, 1966}, {10, 1968}, {10, 1970}, {10, 1972},
+ {10, 1974}, {10, 1976}, {10, 1978}, {10, 1980}, {10, 1982}, {10, 1984},
+ {10, 1986}, {10, 1988}, {10, 1990}, {10, 1992}, {10, 1994}, {10, 1996},
+ {10, 1998}, {10, 2000}, {10, 2002}, {10, 2004}, {10, 2006}, {10, 2008},
+ {10, 2010}, {10, 2012}, {10, 2014}, {10, 2016}, {10, 2018}, {10, 2020},
+ {10, 2022}, {10, 2024}, {10, 2026}, {10, 2028}, {10, 2030}, {10, 2032},
+ {10, 2034}, {10, 2036}, {10, 2038}, {10, 2040}, {10, 2042}, {10, 2044},
+ {10, 2046}, {10, 2048}, {10, 2050}, {10, 2052}, {10, 2054}, {10, 2056},
+ {10, 2058}, {10, 2060}, {10, 2062}, {10, 2064}, {10, 2066}, {10, 2068},
+ {10, 2070}, {10, 2072}, {10, 2074}, {10, 2076}, {10, 2078}, {10, 2080},
+ {10, 2082}, {10, 2084}, {10, 2086}, {10, 2088}, {10, 2090}, {10, 2092},
+ {10, 2094}, {10, 2096}, {10, 2098}, {10, 2100}, {10, 2102}, {10, 2104},
+ {10, 2106}, {10, 2108}, {10, 2110}, {10, 2112}, {10, 2114}, {10, 2116},
+ {10, 2118}, {10, 2120}, {10, 2122}, {10, 2124}, {10, 2126}, {10, 2128},
+ {10, 2130}, {10, 2132}, {10, 2134}, {10, 2136}, {10, 2138}, {10, 2140},
+ {10, 2142}, {10, 2144}, {10, 2146}, {10, 2148}, {10, 2150}, {10, 2152},
+ {10, 2154}, {10, 2156}, {10, 2158}, {10, 2160}, {10, 2162}, {10, 2164},
+ {10, 2166}, {10, 2168}, {10, 2170}, {10, 2172}, {10, 2174}, {10, 2176},
+ {10, 2178}, {10, 2180}, {10, 2182}, {10, 2184}, {10, 2186}, {10, 2188},
+ {10, 2190}, {10, 2192}, {10, 2194}, {10, 2196}, {10, 2198}, {10, 2200},
+ {10, 2202}, {10, 2204}, {10, 2206}, {10, 2208}, {10, 2210}, {10, 2212},
+ {10, 2214}, {10, 2216}, {10, 2218}, {10, 2220}, {10, 2222}, {10, 2224},
+ {10, 2226}, {10, 2228}, {10, 2230}, {10, 2232}, {10, 2234}, {10, 2236},
+ {10, 2238}, {10, 2240}, {10, 2242}, {10, 2244}, {10, 2246}, {10, 2248},
+ {10, 2250}, {10, 2252}, {10, 2254}, {10, 2256}, {10, 2258}, {10, 2260},
+ {10, 2262}, {10, 2264}, {10, 2266}, {10, 2268}, {10, 2270}, {10, 2272},
+ {10, 2274}, {10, 2276}, {10, 2278}, {10, 2280}, {10, 2282}, {10, 2284},
+ {10, 2286}, {10, 2288}, {10, 2290}, {10, 2292}, {10, 2294}, {10, 2296},
+ {10, 2298}, {10, 2300}, {10, 2302}, {10, 2304}, {10, 2306}, {10, 2308},
+ {10, 2310}, {10, 2312}, {10, 2314}, {10, 2316}, {10, 2318}, {10, 2320},
+ {10, 2322}, {10, 2324}, {10, 2326}, {10, 2328}, {10, 2330}, {10, 2332},
+ {10, 2334}, {10, 2336}, {10, 2338}, {10, 2340}, {10, 2342}, {10, 2344},
+ {10, 2346}, {10, 2348}, {10, 2350}, {10, 2352}, {10, 2354}, {10, 2356},
+ {10, 2358}, {10, 2360}, {10, 2362}, {10, 2364}, {10, 2366}, {10, 2368},
+ {10, 2370}, {10, 2372}, {10, 2374}, {10, 2376}, {10, 2378}, {10, 2380},
+ {10, 2382}, {10, 2384}, {10, 2386}, {10, 2388}, {10, 2390}, {10, 2392},
+ {10, 2394}, {10, 2396}, {10, 2398}, {10, 2400}, {10, 2402}, {10, 2404},
+ {10, 2406}, {10, 2408}, {10, 2410}, {10, 2412}, {10, 2414}, {10, 2416},
+ {10, 2418}, {10, 2420}, {10, 2422}, {10, 2424}, {10, 2426}, {10, 2428},
+ {10, 2430}, {10, 2432}, {10, 2434}, {10, 2436}, {10, 2438}, {10, 2440},
+ {10, 2442}, {10, 2444}, {10, 2446}, {10, 2448}, {10, 2450}, {10, 2452},
+ {10, 2454}, {10, 2456}, {10, 2458}, {10, 2460}, {10, 2462}, {10, 2464},
+ {10, 2466}, {10, 2468}, {10, 2470}, {10, 2472}, {10, 2474}, {10, 2476},
+ {10, 2478}, {10, 2480}, {10, 2482}, {10, 2484}, {10, 2486}, {10, 2488},
+ {10, 2490}, {10, 2492}, {10, 2494}, {10, 2496}, {10, 2498}, {10, 2500},
+ {10, 2502}, {10, 2504}, {10, 2506}, {10, 2508}, {10, 2510}, {10, 2512},
+ {10, 2514}, {10, 2516}, {10, 2518}, {10, 2520}, {10, 2522}, {10, 2524},
+ {10, 2526}, {10, 2528}, {10, 2530}, {10, 2532}, {10, 2534}, {10, 2536},
+ {10, 2538}, {10, 2540}, {10, 2542}, {10, 2544}, {10, 2546}, {10, 2548},
+ {10, 2550}, {10, 2552}, {10, 2554}, {10, 2556}, {10, 2558}, {10, 2560},
+ {10, 2562}, {10, 2564}, {10, 2566}, {10, 2568}, {10, 2570}, {10, 2572},
+ {10, 2574}, {10, 2576}, {10, 2578}, {10, 2580}, {10, 2582}, {10, 2584},
+ {10, 2586}, {10, 2588}, {10, 2590}, {10, 2592}, {10, 2594}, {10, 2596},
+ {10, 2598}, {10, 2600}, {10, 2602}, {10, 2604}, {10, 2606}, {10, 2608},
+ {10, 2610}, {10, 2612}, {10, 2614}, {10, 2616}, {10, 2618}, {10, 2620},
+ {10, 2622}, {10, 2624}, {10, 2626}, {10, 2628}, {10, 2630}, {10, 2632},
+ {10, 2634}, {10, 2636}, {10, 2638}, {10, 2640}, {10, 2642}, {10, 2644},
+ {10, 2646}, {10, 2648}, {10, 2650}, {10, 2652}, {10, 2654}, {10, 2656},
+ {10, 2658}, {10, 2660}, {10, 2662}, {10, 2664}, {10, 2666}, {10, 2668},
+ {10, 2670}, {10, 2672}, {10, 2674}, {10, 2676}, {10, 2678}, {10, 2680},
+ {10, 2682}, {10, 2684}, {10, 2686}, {10, 2688}, {10, 2690}, {10, 2692},
+ {10, 2694}, {10, 2696}, {10, 2698}, {10, 2700}, {10, 2702}, {10, 2704},
+ {10, 2706}, {10, 2708}, {10, 2710}, {10, 2712}, {10, 2714}, {10, 2716},
+ {10, 2718}, {10, 2720}, {10, 2722}, {10, 2724}, {10, 2726}, {10, 2728},
+ {10, 2730}, {10, 2732}, {10, 2734}, {10, 2736}, {10, 2738}, {10, 2740},
+ {10, 2742}, {10, 2744}, {10, 2746}, {10, 2748}, {10, 2750}, {10, 2752},
+ {10, 2754}, {10, 2756}, {10, 2758}, {10, 2760}, {10, 2762}, {10, 2764},
+ {10, 2766}, {10, 2768}, {10, 2770}, {10, 2772}, {10, 2774}, {10, 2776},
+ {10, 2778}, {10, 2780}, {10, 2782}, {10, 2784}, {10, 2786}, {10, 2788},
+ {10, 2790}, {10, 2792}, {10, 2794}, {10, 2796}, {10, 2798}, {10, 2800},
+ {10, 2802}, {10, 2804}, {10, 2806}, {10, 2808}, {10, 2810}, {10, 2812},
+ {10, 2814}, {10, 2816}, {10, 2818}, {10, 2820}, {10, 2822}, {10, 2824},
+ {10, 2826}, {10, 2828}, {10, 2830}, {10, 2832}, {10, 2834}, {10, 2836},
+ {10, 2838}, {10, 2840}, {10, 2842}, {10, 2844}, {10, 2846}, {10, 2848},
+ {10, 2850}, {10, 2852}, {10, 2854}, {10, 2856}, {10, 2858}, {10, 2860},
+ {10, 2862}, {10, 2864}, {10, 2866}, {10, 2868}, {10, 2870}, {10, 2872},
+ {10, 2874}, {10, 2876}, {10, 2878}, {10, 2880}, {10, 2882}, {10, 2884},
+ {10, 2886}, {10, 2888}, {10, 2890}, {10, 2892}, {10, 2894}, {10, 2896},
+ {10, 2898}, {10, 2900}, {10, 2902}, {10, 2904}, {10, 2906}, {10, 2908},
+ {10, 2910}, {10, 2912}, {10, 2914}, {10, 2916}, {10, 2918}, {10, 2920},
+ {10, 2922}, {10, 2924}, {10, 2926}, {10, 2928}, {10, 2930}, {10, 2932},
+ {10, 2934}, {10, 2936}, {10, 2938}, {10, 2940}, {10, 2942}, {10, 2944},
+ {10, 2946}, {10, 2948}, {10, 2950}, {10, 2952}, {10, 2954}, {10, 2956},
+ {10, 2958}, {10, 2960}, {10, 2962}, {10, 2964}, {10, 2966}, {10, 2968},
+ {10, 2970}, {10, 2972}, {10, 2974}, {10, 2976}, {10, 2978}, {10, 2980},
+ {10, 2982}, {10, 2984}, {10, 2986}, {10, 2988}, {10, 2990}, {10, 2992},
+ {10, 2994}, {10, 2996}, {10, 2998}, {10, 3000}, {10, 3002}, {10, 3004},
+ {10, 3006}, {10, 3008}, {10, 3010}, {10, 3012}, {10, 3014}, {10, 3016},
+ {10, 3018}, {10, 3020}, {10, 3022}, {10, 3024}, {10, 3026}, {10, 3028},
+ {10, 3030}, {10, 3032}, {10, 3034}, {10, 3036}, {10, 3038}, {10, 3040},
+ {10, 3042}, {10, 3044}, {10, 3046}, {10, 3048}, {10, 3050}, {10, 3052},
+ {10, 3054}, {10, 3056}, {10, 3058}, {10, 3060}, {10, 3062}, {10, 3064},
+ {10, 3066}, {10, 3068}, {10, 3070}, {10, 3072}, {10, 3074}, {10, 3076},
+ {10, 3078}, {10, 3080}, {10, 3082}, {10, 3084}, {10, 3086}, {10, 3088},
+ {10, 3090}, {10, 3092}, {10, 3094}, {10, 3096}, {10, 3098}, {10, 3100},
+ {10, 3102}, {10, 3104}, {10, 3106}, {10, 3108}, {10, 3110}, {10, 3112},
+ {10, 3114}, {10, 3116}, {10, 3118}, {10, 3120}, {10, 3122}, {10, 3124},
+ {10, 3126}, {10, 3128}, {10, 3130}, {10, 3132}, {10, 3134}, {10, 3136},
+ {10, 3138}, {10, 3140}, {10, 3142}, {10, 3144}, {10, 3146}, {10, 3148},
+ {10, 3150}, {10, 3152}, {10, 3154}, {10, 3156}, {10, 3158}, {10, 3160},
+ {10, 3162}, {10, 3164}, {10, 3166}, {10, 3168}, {10, 3170}, {10, 3172},
+ {10, 3174}, {10, 3176}, {10, 3178}, {10, 3180}, {10, 3182}, {10, 3184},
+ {10, 3186}, {10, 3188}, {10, 3190}, {10, 3192}, {10, 3194}, {10, 3196},
+ {10, 3198}, {10, 3200}, {10, 3202}, {10, 3204}, {10, 3206}, {10, 3208},
+ {10, 3210}, {10, 3212}, {10, 3214}, {10, 3216}, {10, 3218}, {10, 3220},
+ {10, 3222}, {10, 3224}, {10, 3226}, {10, 3228}, {10, 3230}, {10, 3232},
+ {10, 3234}, {10, 3236}, {10, 3238}, {10, 3240}, {10, 3242}, {10, 3244},
+ {10, 3246}, {10, 3248}, {10, 3250}, {10, 3252}, {10, 3254}, {10, 3256},
+ {10, 3258}, {10, 3260}, {10, 3262}, {10, 3264}, {10, 3266}, {10, 3268},
+ {10, 3270}, {10, 3272}, {10, 3274}, {10, 3276}, {10, 3278}, {10, 3280},
+ {10, 3282}, {10, 3284}, {10, 3286}, {10, 3288}, {10, 3290}, {10, 3292},
+ {10, 3294}, {10, 3296}, {10, 3298}, {10, 3300}, {10, 3302}, {10, 3304},
+ {10, 3306}, {10, 3308}, {10, 3310}, {10, 3312}, {10, 3314}, {10, 3316},
+ {10, 3318}, {10, 3320}, {10, 3322}, {10, 3324}, {10, 3326}, {10, 3328},
+ {10, 3330}, {10, 3332}, {10, 3334}, {10, 3336}, {10, 3338}, {10, 3340},
+ {10, 3342}, {10, 3344}, {10, 3346}, {10, 3348}, {10, 3350}, {10, 3352},
+ {10, 3354}, {10, 3356}, {10, 3358}, {10, 3360}, {10, 3362}, {10, 3364},
+ {10, 3366}, {10, 3368}, {10, 3370}, {10, 3372}, {10, 3374}, {10, 3376},
+ {10, 3378}, {10, 3380}, {10, 3382}, {10, 3384}, {10, 3386}, {10, 3388},
+ {10, 3390}, {10, 3392}, {10, 3394}, {10, 3396}, {10, 3398}, {10, 3400},
+ {10, 3402}, {10, 3404}, {10, 3406}, {10, 3408}, {10, 3410}, {10, 3412},
+ {10, 3414}, {10, 3416}, {10, 3418}, {10, 3420}, {10, 3422}, {10, 3424},
+ {10, 3426}, {10, 3428}, {10, 3430}, {10, 3432}, {10, 3434}, {10, 3436},
+ {10, 3438}, {10, 3440}, {10, 3442}, {10, 3444}, {10, 3446}, {10, 3448},
+ {10, 3450}, {10, 3452}, {10, 3454}, {10, 3456}, {10, 3458}, {10, 3460},
+ {10, 3462}, {10, 3464}, {10, 3466}, {10, 3468}, {10, 3470}, {10, 3472},
+ {10, 3474}, {10, 3476}, {10, 3478}, {10, 3480}, {10, 3482}, {10, 3484},
+ {10, 3486}, {10, 3488}, {10, 3490}, {10, 3492}, {10, 3494}, {10, 3496},
+ {10, 3498}, {10, 3500}, {10, 3502}, {10, 3504}, {10, 3506}, {10, 3508},
+ {10, 3510}, {10, 3512}, {10, 3514}, {10, 3516}, {10, 3518}, {10, 3520},
+ {10, 3522}, {10, 3524}, {10, 3526}, {10, 3528}, {10, 3530}, {10, 3532},
+ {10, 3534}, {10, 3536}, {10, 3538}, {10, 3540}, {10, 3542}, {10, 3544},
+ {10, 3546}, {10, 3548}, {10, 3550}, {10, 3552}, {10, 3554}, {10, 3556},
+ {10, 3558}, {10, 3560}, {10, 3562}, {10, 3564}, {10, 3566}, {10, 3568},
+ {10, 3570}, {10, 3572}, {10, 3574}, {10, 3576}, {10, 3578}, {10, 3580},
+ {10, 3582}, {10, 3584}, {10, 3586}, {10, 3588}, {10, 3590}, {10, 3592},
+ {10, 3594}, {10, 3596}, {10, 3598}, {10, 3600}, {10, 3602}, {10, 3604},
+ {10, 3606}, {10, 3608}, {10, 3610}, {10, 3612}, {10, 3614}, {10, 3616},
+ {10, 3618}, {10, 3620}, {10, 3622}, {10, 3624}, {10, 3626}, {10, 3628},
+ {10, 3630}, {10, 3632}, {10, 3634}, {10, 3636}, {10, 3638}, {10, 3640},
+ {10, 3642}, {10, 3644}, {10, 3646}, {10, 3648}, {10, 3650}, {10, 3652},
+ {10, 3654}, {10, 3656}, {10, 3658}, {10, 3660}, {10, 3662}, {10, 3664},
+ {10, 3666}, {10, 3668}, {10, 3670}, {10, 3672}, {10, 3674}, {10, 3676},
+ {10, 3678}, {10, 3680}, {10, 3682}, {10, 3684}, {10, 3686}, {10, 3688},
+ {10, 3690}, {10, 3692}, {10, 3694}, {10, 3696}, {10, 3698}, {10, 3700},
+ {10, 3702}, {10, 3704}, {10, 3706}, {10, 3708}, {10, 3710}, {10, 3712},
+ {10, 3714}, {10, 3716}, {10, 3718}, {10, 3720}, {10, 3722}, {10, 3724},
+ {10, 3726}, {10, 3728}, {10, 3730}, {10, 3732}, {10, 3734}, {10, 3736},
+ {10, 3738}, {10, 3740}, {10, 3742}, {10, 3744}, {10, 3746}, {10, 3748},
+ {10, 3750}, {10, 3752}, {10, 3754}, {10, 3756}, {10, 3758}, {10, 3760},
+ {10, 3762}, {10, 3764}, {10, 3766}, {10, 3768}, {10, 3770}, {10, 3772},
+ {10, 3774}, {10, 3776}, {10, 3778}, {10, 3780}, {10, 3782}, {10, 3784},
+ {10, 3786}, {10, 3788}, {10, 3790}, {10, 3792}, {10, 3794}, {10, 3796},
+ {10, 3798}, {10, 3800}, {10, 3802}, {10, 3804}, {10, 3806}, {10, 3808},
+ {10, 3810}, {10, 3812}, {10, 3814}, {10, 3816}, {10, 3818}, {10, 3820},
+ {10, 3822}, {10, 3824}, {10, 3826}, {10, 3828}, {10, 3830}, {10, 3832},
+ {10, 3834}, {10, 3836}, {10, 3838}, {10, 3840}, {10, 3842}, {10, 3844},
+ {10, 3846}, {10, 3848}, {10, 3850}, {10, 3852}, {10, 3854}, {10, 3856},
+ {10, 3858}, {10, 3860}, {10, 3862}, {10, 3864}, {10, 3866}, {10, 3868},
+ {10, 3870}, {10, 3872}, {10, 3874}, {10, 3876}, {10, 3878}, {10, 3880},
+ {10, 3882}, {10, 3884}, {10, 3886}, {10, 3888}, {10, 3890}, {10, 3892},
+ {10, 3894}, {10, 3896}, {10, 3898}, {10, 3900}, {10, 3902}, {10, 3904},
+ {10, 3906}, {10, 3908}, {10, 3910}, {10, 3912}, {10, 3914}, {10, 3916},
+ {10, 3918}, {10, 3920}, {10, 3922}, {10, 3924}, {10, 3926}, {10, 3928},
+ {10, 3930}, {10, 3932}, {10, 3934}, {10, 3936}, {10, 3938}, {10, 3940},
+ {10, 3942}, {10, 3944}, {10, 3946}, {10, 3948}, {10, 3950}, {10, 3952},
+ {10, 3954}, {10, 3956}, {10, 3958}, {10, 3960}
+};
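
Judging by the visible pattern, each pair in the table above maps a DCT coefficient value to a {token, extra-bits} entry: magnitudes 0 to 4 get direct tokens with the sign carried in the extra bit, and larger magnitudes fall into category tokens 5 to 10 with bases 5, 7, 11, 19, 35 and 67, where extra = ((|v| - base) << 1) | sign. A minimal sketch of that mapping, under that assumption (the function name is hypothetical, not part of the patch):

    #include <stdlib.h>

    /* Hypothetical reconstruction of the generator's mapping; the bases are
     * read off the category boundaries visible in the table above. */
    static void value_to_token(int v, int *token, int *extra)
    {
        static const struct { int token, base; } cats[] = {
            {5, 5}, {6, 7}, {7, 11}, {8, 19}, {9, 35}, {10, 67}
        };
        int a = abs(v), sign = v < 0, i;

        if (a <= 4) { *token = a; *extra = sign; return; } /* direct tokens */
        for (i = 5; i >= 0; i--)
            if (a >= cats[i].base) { /* largest category that fits */
                *token = cats[i].token;
                *extra = ((a - cats[i].base) << 1) | sign;
                return;
            }
    }

For example, value_to_token(-66, ...) yields {9, 63} and value_to_token(2047, ...) yields {10, 3960}, matching the entries above.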
diff --git a/libvpx/vp8/encoder/defaultcoefcounts.h b/libvpx/vp8/encoder/defaultcoefcounts.h
new file mode 100644
index 0000000..2c0f3dd
--- /dev/null
+++ b/libvpx/vp8/encoder/defaultcoefcounts.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Generated file, included by entropy.c */
+
+static const unsigned int default_coef_counts[BLOCK_TYPES]
+ [COEF_BANDS]
+ [PREV_COEF_CONTEXTS]
+ [MAX_ENTROPY_TOKENS] =
+{
+
+ {
+ /* Block Type ( 0 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
+ {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
+ {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
+ {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
+ {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
+ { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
+ { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
+ { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
+ { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
+ },
+ },
+ {
+ /* Block Type ( 1 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
+ {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
+ {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
+ {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
+ {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
+ {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
+ {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
+ {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
+ {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
+ {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
+ { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
+ { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
+ {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
+ { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
+ { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ },
+ {
+ /* Block Type ( 2 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
+ {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
+ {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
+ {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
+ {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
+ { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
+ { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
+ { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
+ { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
+ { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
+ { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ },
+ },
+ {
+ /* Block Type ( 3 ) */
+ {
+ /* Coeff Band ( 0 ) */
+ {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
+ {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
+ {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
+ },
+ {
+ /* Coeff Band ( 1 ) */
+ {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
+ {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
+ {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
+ },
+ {
+ /* Coeff Band ( 2 ) */
+ {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
+ {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
+ {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
+ },
+ {
+ /* Coeff Band ( 3 ) */
+ {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
+ {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
+ {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
+ },
+ {
+ /* Coeff Band ( 4 ) */
+ {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
+ {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
+ {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
+ },
+ {
+ /* Coeff Band ( 5 ) */
+ {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
+ {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
+ {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
+ },
+ {
+ /* Coeff Band ( 6 ) */
+ {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
+ {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
+ {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
+ },
+ {
+ /* Coeff Band ( 7 ) */
+ { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
+ { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
+ { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
+ },
+ },
+};
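
These default counts seed the entropy coder before real statistics accumulate; at runtime they are presumably folded into branch probabilities over the token tree. As a rough illustration of the count-to-probability step only (a simplified sketch, not the actual tree walk libvpx performs), a single binary branch could be derived like this:

    /* Simplified sketch: scale a pair of event counts to the 1..255
     * probability range VP8 uses for a binary branch. Not the real
     * libvpx update rule, which distributes counts over the token tree. */
    static unsigned char count_to_prob(unsigned int ct0, unsigned int ct1)
    {
        unsigned int total = ct0 + ct1;
        unsigned int p;

        if (total == 0)
            return 128; /* no evidence: even odds */
        p = (255 * ct0 + total / 2) / total; /* rounded scale to 0..255 */
        if (p < 1) p = 1;
        if (p > 255) p = 255;
        return (unsigned char)p;
    }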
diff --git a/libvpx/vp8/encoder/denoising.c b/libvpx/vp8/encoder/denoising.c
new file mode 100644
index 0000000..c0dd7c1
--- /dev/null
+++ b/libvpx/vp8/encoder/denoising.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "denoising.h"
+
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
+/* SSE_DIFF_THRESHOLD is selected to give ~95% confidence, assuming
+ * var(noise) ~= 100.
+ */
+static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
+static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
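+/* Both SSE thresholds are expressed per 16x16 macroblock, i.e. an average
+ * squared pixel difference of 20 and 40 respectively. */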
+
+/*
+ * The filter function was modified to reduce the computational complexity.
+ * Step 1:
+ * Instead of applying tap coefficients to each pixel, the pixel
+ * adjustment for each pixel diff value is calculated ahead of time.
+ * adjustment = filtered_value - current_raw
+ * = (filter_coefficient * diff + 128) >> 8
+ * where
+ * filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+ * filter_coefficient += filter_coefficient /
+ * (3 + motion_magnitude_adjustment);
+ *     filter_coefficient is clamped to the range [0, 255].
+ *
+ * Step 2:
+ * The adjustment vs. diff curve flattens out very quickly as diff increases.
+ * This allows only a few levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff adjustment w/o motion correction adjustment w/ motion correction
+ * [-255, -16] -6 -7
+ * [-15, -8] -4 -5
+ * [-7, -4] -3 -4
+ * [-3, 3] diff diff
+ * [4, 7] 3 4
+ * [8, 15] 4 5
+ * [16, 255] 6 7
+ */
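+/* Worked example (illustrative, using the formulas above): for diff = 12
+ * and motion_magnitude_adjustment = 0,
+ *     filter_coefficient = (255 << 8) / (256 + ((12 * 330) >> 3))
+ *                        = 65280 / 751 = 86 (integer division)
+ *     filter_coefficient += 86 / 3  ->  86 + 28 = 114
+ *     adjustment = (114 * 12 + 128) >> 8 = 5
+ * which matches the level used for diffs in [8, 15] with motion
+ * correction. */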
+
+int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
+ YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
+ unsigned int motion_magnitude, int y_offset,
+ int uv_offset)
+{
+ unsigned char *sig = signal->thismb;
+ int sig_stride = 16;
+ unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+ int mc_avg_y_stride = mc_running_avg->y_stride;
+ unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
+ int avg_y_stride = running_avg->y_stride;
+ int r, c, i;
+ int sum_diff = 0;
+ int adj_val[3] = {3, 4, 6};
+
+    /* If motion_magnitude is small, make the denoiser more aggressive by
+     * increasing the adjustment for each level. */
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ {
+ for (i = 0; i < 3; i++)
+ adj_val[i] += 1;
+ }
+
+ for (r = 0; r < 16; ++r)
+ {
+ for (c = 0; c < 16; ++c)
+ {
+ int diff = 0;
+ int adjustment = 0;
+ int absdiff = 0;
+
+ diff = mc_running_avg_y[c] - sig[c];
+ absdiff = abs(diff);
+
+            /* When |diff| < 4, use the pixel value from the last denoised
+             * frame. */
+ if (absdiff <= 3)
+ {
+ running_avg_y[c] = mc_running_avg_y[c];
+ sum_diff += diff;
+ }
+ else
+ {
+ if (absdiff >= 4 && absdiff <= 7)
+ adjustment = adj_val[0];
+ else if (absdiff >= 8 && absdiff <= 15)
+ adjustment = adj_val[1];
+ else
+ adjustment = adj_val[2];
+
+ if (diff > 0)
+ {
+ if ((sig[c] + adjustment) > 255)
+ running_avg_y[c] = 255;
+ else
+ running_avg_y[c] = sig[c] + adjustment;
+
+ sum_diff += adjustment;
+ }
+ else
+ {
+ if ((sig[c] - adjustment) < 0)
+ running_avg_y[c] = 0;
+ else
+ running_avg_y[c] = sig[c] - adjustment;
+
+ sum_diff -= adjustment;
+ }
+ }
+ }
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_avg_y_stride;
+ running_avg_y += avg_y_stride;
+ }
+
+ if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ return COPY_BLOCK;
+
+ vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+ signal->thismb, sig_stride);
+ return FILTER_BLOCK;
+}
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
+{
+ int i;
+ assert(denoiser);
+
+    /* No running average is needed for the intra frame, so start at 1. */
+ for (i = 1; i < MAX_REF_FRAMES; i++)
+ {
+ denoiser->yv12_running_avg[i].flags = 0;
+
+ if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_running_avg[i]), width,
+ height, VP8BORDERINPIXELS)
+ < 0)
+ {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+ vpx_memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+ denoiser->yv12_running_avg[i].frame_size);
+
+ }
+ denoiser->yv12_mc_running_avg.flags = 0;
+
+ if (vp8_yv12_alloc_frame_buffer(&(denoiser->yv12_mc_running_avg), width,
+ height, VP8BORDERINPIXELS) < 0)
+ {
+ vp8_denoiser_free(denoiser);
+ return 1;
+ }
+
+ vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+ denoiser->yv12_mc_running_avg.frame_size);
+ return 0;
+}
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser)
+{
+ int i;
+ assert(denoiser);
+
+ /* we don't have one for intra ref frame */
+ for (i = 1; i < MAX_REF_FRAMES ; i++)
+ {
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
+ }
+ vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+}
+
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+ MACROBLOCK *x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset)
+{
+ int mv_row;
+ int mv_col;
+ unsigned int motion_magnitude2;
+
+ MV_REFERENCE_FRAME frame = x->best_reference_frame;
+ MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
+
+ enum vp8_denoiser_decision decision = FILTER_BLOCK;
+
+ if (zero_frame)
+ {
+ YV12_BUFFER_CONFIG *src = &denoiser->yv12_running_avg[frame];
+ YV12_BUFFER_CONFIG *dst = &denoiser->yv12_mc_running_avg;
+        YV12_BUFFER_CONFIG saved_pre, saved_dst;
+ MB_MODE_INFO saved_mbmi;
+ MACROBLOCKD *filter_xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
+ int mv_col;
+ int mv_row;
+ int sse_diff = zero_mv_sse - best_sse;
+
+ saved_mbmi = *mbmi;
+
+ /* Use the best MV for the compensation. */
+ mbmi->ref_frame = x->best_reference_frame;
+ mbmi->mode = x->best_sse_inter_mode;
+ mbmi->mv = x->best_sse_mv;
+ mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
+ mv_col = x->best_sse_mv.as_mv.col;
+ mv_row = x->best_sse_mv.as_mv.row;
+
+ if (frame == INTRA_FRAME ||
+            ((unsigned int)(mv_row * mv_row + mv_col * mv_col)
+ <= NOISE_MOTION_THRESHOLD &&
+ sse_diff < (int)SSE_DIFF_THRESHOLD))
+ {
+ /*
+ * Handle intra blocks as referring to last frame with zero motion
+ * and let the absolute pixel difference affect the filter factor.
+             * Also treat a small amount of motion as random walk caused
+             * by noise, provided it does not produce a much bigger error.
+             * Note that any changes to the mode info only affect the
+             * denoising.
+ */
+ mbmi->ref_frame =
+ x->best_zeromv_reference_frame;
+
+ src = &denoiser->yv12_running_avg[zero_frame];
+
+ mbmi->mode = ZEROMV;
+ mbmi->mv.as_int = 0;
+ x->best_sse_inter_mode = ZEROMV;
+ x->best_sse_mv.as_int = 0;
+ best_sse = zero_mv_sse;
+ }
+
+ saved_pre = filter_xd->pre;
+ saved_dst = filter_xd->dst;
+
+ /* Compensate the running average. */
+ filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
+ filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
+ filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
+ /* Write the compensated running average to the destination buffer. */
+ filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
+ filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
+ filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
+
+ if (!x->skip)
+ {
+ vp8_build_inter_predictors_mb(filter_xd);
+ }
+ else
+ {
+ vp8_build_inter16x16_predictors_mb(filter_xd,
+ filter_xd->dst.y_buffer,
+ filter_xd->dst.u_buffer,
+ filter_xd->dst.v_buffer,
+ filter_xd->dst.y_stride,
+ filter_xd->dst.uv_stride);
+ }
+ filter_xd->pre = saved_pre;
+ filter_xd->dst = saved_dst;
+ *mbmi = saved_mbmi;
+
+ }
+
+ mv_row = x->best_sse_mv.as_mv.row;
+ mv_col = x->best_sse_mv.as_mv.col;
+ motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
+ if (best_sse > SSE_THRESHOLD || motion_magnitude2
+ > 8 * NOISE_MOTION_THRESHOLD)
+ {
+ decision = COPY_BLOCK;
+ }
+
+ if (decision == FILTER_BLOCK)
+ {
+ /* Filter. */
+ decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
+ &denoiser->yv12_running_avg[LAST_FRAME],
+ x,
+ motion_magnitude2,
+ recon_yoffset, recon_uvoffset);
+ }
+ if (decision == COPY_BLOCK)
+ {
+ /* No filtering of this block; it differs too much from the predictor,
+ * or the motion vector magnitude is considered too big.
+ */
+ vp8_copy_mem16x16(
+ x->thismb, 16,
+ denoiser->yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset,
+ denoiser->yv12_running_avg[LAST_FRAME].y_stride);
+ }
+}
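
The tail of vp8_denoiser_denoise_mb gates the filter on two thresholds. Condensed into a small pure function for readability (a restatement of the code above, not part of the patch):

    /* Restates the gate at the end of vp8_denoiser_denoise_mb: skip
     * filtering when the block differs too much from the predictor or
     * the motion vector magnitude is too large. */
    enum vp8_denoiser_decision { COPY_BLOCK, FILTER_BLOCK };

    static enum vp8_denoiser_decision
    denoise_decision(unsigned int best_sse, int mv_row, int mv_col)
    {
        const unsigned int noise_motion_threshold = 25 * 25;
        const unsigned int sse_threshold = 16 * 16 * 40;
        const unsigned int motion_magnitude2 =
            (unsigned int)(mv_row * mv_row + mv_col * mv_col);

        if (best_sse > sse_threshold ||
            motion_magnitude2 > 8 * noise_motion_threshold)
            return COPY_BLOCK;
        return FILTER_BLOCK;
    }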
diff --git a/libvpx/vp8/encoder/denoising.h b/libvpx/vp8/encoder/denoising.h
new file mode 100644
index 0000000..b025f5c
--- /dev/null
+++ b/libvpx/vp8/encoder/denoising.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_DENOISING_H_
+#define VP8_ENCODER_DENOISING_H_
+
+#include "block.h"
+
+#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
+#define MOTION_MAGNITUDE_THRESHOLD (8*3)
+
+enum vp8_denoiser_decision
+{
+ COPY_BLOCK,
+ FILTER_BLOCK
+};
+
+typedef struct vp8_denoiser
+{
+ YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
+ YV12_BUFFER_CONFIG yv12_mc_running_avg;
+} VP8_DENOISER;
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
+
+void vp8_denoiser_free(VP8_DENOISER *denoiser);
+
+void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
+ MACROBLOCK *x,
+ unsigned int best_sse,
+ unsigned int zero_mv_sse,
+ int recon_yoffset,
+ int recon_uvoffset);
+
+#endif /* VP8_ENCODER_DENOISING_H_ */
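
The header above is the denoiser's entire public surface. A minimal lifecycle sketch (the frame dimensions are illustrative, and x, best_sse, zero_mv_sse and the recon offsets are assumed to come from the encoder's mode search):

    VP8_DENOISER denoiser;

    /* Once per stream: allocate the running-average buffers.
     * Returns nonzero on failure. */
    if (vp8_denoiser_allocate(&denoiser, 640, 480))
        return -1;

    /* Once per inter macroblock, after mode selection: */
    vp8_denoiser_denoise_mb(&denoiser, x, best_sse, zero_mv_sse,
                            recon_yoffset, recon_uvoffset);

    /* Once at teardown: */
    vp8_denoiser_free(&denoiser);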
diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c
new file mode 100644
index 0000000..8828dd9
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeframe.c
@@ -0,0 +1,1393 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "vp8/common/common.h"
+#include "onyx_int.h"
+#include "vp8/common/extend.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#include "vp8/common/setupintrarecon.h"
+#include "encodeintra.h"
+#include "vp8/common/reconinter.h"
+#include "rdopt.h"
+#include "pickinter.h"
+#include "vp8/common/findnearmv.h"
+#include <stdio.h>
+#include <limits.h>
+#include "vp8/common/invtrans.h"
+#include "vpx_ports/vpx_timer.h"
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+#include "bitstream.h"
+#endif
+#include "encodeframe.h"
+
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern void vp8_calc_ref_frame_costs(int *ref_frame_cost,
+ int prob_intra,
+ int prob_last,
+ int prob_garf
+ );
+extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
+extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+extern void vp8_auto_select_speed(VP8_COMP *cpi);
+extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ MB_ROW_COMP *mbr_ei,
+ int mb_row,
+ int count);
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int inter_uv_modes[4] = {0, 0, 0, 0};
+unsigned int inter_b_modes[15] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int y_modes[5] = {0, 0, 0, 0, 0};
+unsigned int uv_modes[4] = {0, 0, 0, 0};
+unsigned int b_modes[14] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+
+/* activity_avg must be positive, or flat regions could get a zero weight
+ * (infinite lambda), which confounds analysis.
+ * This also avoids the need for divide by zero checks in
+ * vp8_activity_masking().
+ */
+#define VP8_ACTIVITY_AVG_MIN (64)
+
+/* This is used as a reference when computing the source variance for the
+ * purposes of activity masking.
+ * Eventually this should be replaced by custom no-reference routines,
+ * which will be faster.
+ */
+static const unsigned char VP8_VAR_OFFS[16]=
+{
+ 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+};
+
+
+/* Original activity measure from Tim T's code. */
+static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
+{
+ unsigned int act;
+ unsigned int sse;
+ /* TODO: This could also be done over smaller areas (8x8), but that would
+ * require extensive changes elsewhere, as lambda is assumed to be fixed
+ * over an entire MB in most of the code.
+ * Another option is to compute four 8x8 variances, and pick a single
+ * lambda using a non-linear combination (e.g., the smallest, or second
+ * smallest, etc.).
+ */
+ act = vp8_variance16x16(x->src.y_buffer,
+ x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
+ act = act<<4;
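+    /* act is now 16x the raw 16x16 variance, so the 8<<12 and 5<<12
+     * constants below correspond to raw variances of 2048 and 1280. */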
+
+ /* If the region is flat, lower the activity some more. */
+ if (act < 8<<12)
+ act = act < 5<<12 ? act : 5<<12;
+
+ return act;
+}
+
+/* Stub for alternative experimental activity measures. */
+static unsigned int alt_activity_measure( VP8_COMP *cpi,
+ MACROBLOCK *x, int use_dc_pred )
+{
+ return vp8_encode_intra(cpi,x, use_dc_pred);
+}
+
+
+/* Measure the activity of the current macroblock.
+ * Exactly what is measured here is TBD, so it is abstracted into this
+ * function.
+ */
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x,
+ int mb_row, int mb_col)
+{
+ unsigned int mb_activity;
+
+ if ( ALT_ACT_MEASURE )
+ {
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+        /* Or use an alternative. */
+ mb_activity = alt_activity_measure( cpi, x, use_dc_pred );
+ }
+ else
+ {
+ /* Original activity measure from Tim T's code. */
+ mb_activity = tt_activity_measure( cpi, x );
+ }
+
+ if ( mb_activity < VP8_ACTIVITY_AVG_MIN )
+ mb_activity = VP8_ACTIVITY_AVG_MIN;
+
+ return mb_activity;
+}
+
+/* Calculate an "average" mb activity value for the frame */
+#define ACT_MEDIAN 0
+static void calc_av_activity( VP8_COMP *cpi, int64_t activity_sum )
+{
+#if ACT_MEDIAN
+ /* Find median: Simple n^2 algorithm for experimentation */
+ {
+ unsigned int median;
+ unsigned int i,j;
+ unsigned int * sortlist;
+ unsigned int tmp;
+
+ /* Create a list to sort to */
+ CHECK_MEM_ERROR(sortlist,
+ vpx_calloc(sizeof(unsigned int),
+ cpi->common.MBs));
+
+ /* Copy map to sort list */
+ vpx_memcpy( sortlist, cpi->mb_activity_map,
+ sizeof(unsigned int) * cpi->common.MBs );
+
+
+ /* Ripple each value down to its correct position */
+ for ( i = 1; i < cpi->common.MBs; i ++ )
+ {
+ for ( j = i; j > 0; j -- )
+ {
+ if ( sortlist[j] < sortlist[j-1] )
+ {
+ /* Swap values */
+ tmp = sortlist[j-1];
+ sortlist[j-1] = sortlist[j];
+ sortlist[j] = tmp;
+ }
+ else
+ break;
+ }
+ }
+
+ /* Even number of MBs, so estimate the median as the mean of the two
+ * values either side of the midpoint. */
+ median = ( 1 + sortlist[cpi->common.MBs >> 1] +
+ sortlist[(cpi->common.MBs >> 1) + 1] ) >> 1;
+
+ cpi->activity_avg = median;
+
+ vpx_free(sortlist);
+ }
+#else
+ /* Simple mean for now */
+ cpi->activity_avg = (unsigned int)(activity_sum/cpi->common.MBs);
+#endif
+
+ if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
+ cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
+
+ /* Experimental code: return fixed value normalized for several clips */
+ if ( ALT_ACT_MEASURE )
+ cpi->activity_avg = 100000;
+}
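+
+/* Editor's sketch (not part of this change): the mean path above in
+ * isolation; note that with ALT_ACT_MEASURE the computed value is then
+ * overridden by the fixed constant 100000. Values are hypothetical.
+ */
+#if 0
+static unsigned int mean_activity_example(void)
+{
+ int64_t activity_sum = 20000 + 50000 + 80000 + 110000; /* four MBs */
+ int MBs = 4;
+ unsigned int avg = (unsigned int)(activity_sum / MBs); /* 65000 */
+
+ if (avg < VP8_ACTIVITY_AVG_MIN) /* floor of 64 */
+ avg = VP8_ACTIVITY_AVG_MIN;
+
+ return avg;
+}
+#endif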
+
+#define USE_ACT_INDEX 0
+#define OUTPUT_NORM_ACT_STATS 0
+
+#if USE_ACT_INDEX
+/* Calculate an activity index for each mb */
+static void calc_activity_index( VP8_COMP *cpi, MACROBLOCK *x )
+{
+ VP8_COMMON *const cm = & cpi->common;
+ int mb_row, mb_col;
+
+ int64_t act;
+ int64_t a;
+ int64_t b;
+
+#if OUTPUT_NORM_ACT_STATS
+ FILE *f = fopen("norm_act.stt", "a");
+ fprintf(f, "\n%12d\n", cpi->activity_avg );
+#endif
+
+ /* Reset pointers to start of activity map */
+ x->mb_activity_ptr = cpi->mb_activity_map;
+
+ /* Calculate normalized mb activity number. */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ /* Read activity from the map */
+ act = *(x->mb_activity_ptr);
+
+ /* Calculate a normalized activity number */
+ a = act + 4*cpi->activity_avg;
+ b = 4*act + cpi->activity_avg;
+
+ if ( b >= a )
+ *(x->activity_ptr) = (int)((b + (a>>1))/a) - 1;
+ else
+ *(x->activity_ptr) = 1 - (int)((a + (b>>1))/b);
+
+#if OUTPUT_NORM_ACT_STATS
+ fprintf(f, " %6d", *(x->mb_activity_ptr));
+#endif
+ /* Increment activity map pointers */
+ x->mb_activity_ptr++;
+ }
+
+#if OUTPUT_NORM_ACT_STATS
+ fprintf(f, "\n");
+#endif
+
+ }
+
+#if OUTPUT_NORM_ACT_STATS
+ fclose(f);
+#endif
+
+}
+#endif
+
+/* Loop through all MBs, recording the activity of each; then compute the
+ * frame-average activity and, optionally, a normalized activity index for
+ * each MB.
+ */
+static void build_activity_map( VP8_COMP *cpi )
+{
+ MACROBLOCK *const x = & cpi->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ VP8_COMMON *const cm = & cpi->common;
+
+#if ALT_ACT_MEASURE
+ YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ int recon_yoffset;
+ int recon_y_stride = new_yv12->y_stride;
+#endif
+
+ int mb_row, mb_col;
+ unsigned int mb_activity;
+ int64_t activity_sum = 0;
+
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+#if ALT_ACT_MEASURE
+ /* reset above block coeffs */
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+#if ALT_ACT_MEASURE
+ xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+ xd->left_available = (mb_col != 0);
+ recon_yoffset += 16;
+#endif
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ /* measure activity */
+ mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
+
+ /* Keep frame sum */
+ activity_sum += mb_activity;
+
+ /* Store MB level activity details. */
+ *x->mb_activity_ptr = mb_activity;
+
+ /* Increment activity map pointer */
+ x->mb_activity_ptr++;
+
+ /* adjust to the next column of source macroblocks */
+ x->src.y_buffer += 16;
+ }
+
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+ /* extend the recon for intra prediction */
+ vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
+ }
+
+ /* Calculate an "average" MB activity */
+ calc_av_activity(cpi, activity_sum);
+
+#if USE_ACT_INDEX
+ /* Calculate an activity index number of each mb */
+ calc_activity_index( cpi, x );
+#endif
+
+}
+
+/* Macroblock activity masking */
+void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
+{
+#if USE_ACT_INDEX
+ x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+ x->errorperbit = x->rdmult * 100 /(110 * x->rddiv);
+ x->errorperbit += (x->errorperbit==0);
+#else
+ int64_t a;
+ int64_t b;
+ int64_t act = *(x->mb_activity_ptr);
+
+ /* Apply the masking to the RD multiplier. */
+ a = act + (2*cpi->activity_avg);
+ b = (2*act) + cpi->activity_avg;
+
+ x->rdmult = (unsigned int)(((int64_t)x->rdmult*b + (a>>1))/a);
+ x->errorperbit = x->rdmult * 100 /(110 * x->rddiv);
+ x->errorperbit += (x->errorperbit==0);
+#endif
+
+ /* Activity based Zbin adjustment */
+ adjust_act_zbin(cpi, x);
+}
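+
+/* Worked example (editor's addition): the non-index path above rescales
+ * rdmult by b/a = (2*act + avg)/(act + 2*avg), with rounding. An MB four
+ * times as active as the frame average gets rdmult scaled by 9/6 = 1.5,
+ * i.e. a 50% larger lambda; a flat MB with act << avg tends towards 0.5.
+ * Numbers are illustrative.
+ */
+#if 0
+static unsigned int masking_example(unsigned int rdmult)
+{
+ int64_t avg = 100000; /* cpi->activity_avg */
+ int64_t act = 4 * avg; /* a busy macroblock */
+ int64_t a = act + 2 * avg; /* 600000 */
+ int64_t b = 2 * act + avg; /* 900000 */
+
+ return (unsigned int)(((int64_t)rdmult * b + (a >> 1)) / a); /* 1.5x */
+}
+#endif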
+
+static
+void encode_mb_row(VP8_COMP *cpi,
+ VP8_COMMON *cm,
+ int mb_row,
+ MACROBLOCK *x,
+ MACROBLOCKD *xd,
+ TOKENEXTRA **tp,
+ int *segment_counts,
+ int *totalrate)
+{
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ int map_index = (mb_row * cpi->common.mb_cols);
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ const int num_part = (1 << cm->multi_token_partition);
+ TOKENEXTRA * tp_start = cpi->tok;
+ vp8_writer *w;
+#endif
+
+#if CONFIG_MULTITHREAD
+ const int nsync = cpi->mt_sync_range;
+ const int rightmost_col = cm->mb_cols + nsync;
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+
+ if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
+ last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+ else
+ last_row_current_mb_col = &rightmost_col;
+#endif
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if (num_part > 1)
+ w = &cpi->bc[1 + (mb_row % num_part)];
+ else
+ w = &cpi->bc[1];
+#endif
+
+ /* reset above block coeffs */
+ xd->above_context = cm->above_context;
+
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ cpi->tplist[mb_row].start = *tp;
+ /* printf("Main mb_row = %d\n", mb_row); */
+
+ /* Distance of Mb to the top & bottom edges, specified in 1/8th pel
+ * units as they are always compared to values that are in 1/8th pel
+ * units.
+ */
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ /* Set up limit values for vertical motion vector components
+ * to prevent them extending beyond the UMV borders
+ */
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
+ + (VP8BORDERINPIXELS - 16);
+
+ /* Set the mb activity pointer to the start of the row. */
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ *tp = cpi->tok;
+#endif
+ /* Distance of Mb to the left & right edges, specified in
+ * 1/8th pel units as they are always compared to values
+ * that are in 1/8th pel units
+ */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+ /* Set up limit values for horizontal motion vector components
+ * to prevent them extending beyond the UMV borders
+ */
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
+ + (VP8BORDERINPIXELS - 16);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded != 0)
+ {
+ *current_mb_col = mb_col - 1; /* set previous MB done */
+
+ if ((mb_col & (nsync - 1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+ }
+#endif
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp8_activity_masking(cpi, x);
+
+ /* Is segmentation enabled */
+ /* MB level adjustment to quantizer */
+ if (xd->segmentation_enabled)
+ {
+ /* Code to set segment id in xd->mbmi.segment_id for current MB
+ * (with range checking)
+ */
+ if (cpi->segmentation_map[map_index+mb_col] <= 3)
+ xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index+mb_col];
+ else
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x, 1);
+ }
+ else
+ /* Set to Segment 0 by default */
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index + mb_col;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ *totalrate += vp8cx_encode_intra_macroblock(cpi, x, tp);
+#ifdef MODE_STATS
+ y_modes[xd->mbmi.mode] ++;
+#endif
+ }
+ else
+ {
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+
+#ifdef MODE_STATS
+ inter_y_modes[xd->mbmi.mode] ++;
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int b;
+
+ for (b = 0; b < xd->mbmi.partition_count; b++)
+ {
+ inter_b_modes[x->partition->bmi[b].mode] ++;
+ }
+ }
+
+#endif
+
+ /* Special case code for cyclic refresh
+ * If cyclic update is enabled, copy xd->mbmi.segment_id (which
+ * may have been updated based on the mode chosen during
+ * vp8cx_encode_inter_macroblock()) back into the global
+ * segmentation map.
+ */
+ if ((cpi->current_layer == 0) &&
+ (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
+ {
+ cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
+
+ /* If the block has been refreshed, mark it as clean (the
+ * magnitude of the -ve value influences how long it will be
+ * before we consider another refresh).
+ * Else, if it was coded (last frame 0,0) and has not already
+ * been refreshed, mark it as a candidate for cleanup next
+ * time (marked 0); otherwise mark it as dirty (1).
+ */
+ if (xd->mode_info_context->mbmi.segment_id)
+ cpi->cyclic_refresh_map[map_index+mb_col] = -1;
+ else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
+ {
+ if (cpi->cyclic_refresh_map[map_index+mb_col] == 1)
+ cpi->cyclic_refresh_map[map_index+mb_col] = 0;
+ }
+ else
+ cpi->cyclic_refresh_map[map_index+mb_col] = 1;
+
+ }
+ }
+
+ cpi->tplist[mb_row].stop = *tp;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* pack tokens for this MB */
+ {
+ int tok_count = *tp - tp_start;
+ pack_tokens(w, tp_start, tok_count);
+ }
+#endif
+ /* Increment pointer into gf usage flags structure. */
+ x->gf_active_ptr++;
+
+ /* Increment the activity mask pointers. */
+ x->mb_activity_ptr++;
+
+ /* adjust to the next column of macroblocks */
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ /* Keep track of segment usage */
+ segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
+
+ /* skip to next mb */
+ xd->mode_info_context++;
+ x->partition_info++;
+ xd->above_context++;
+ }
+
+ /* extend the recon for intra prediction */
+ vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded != 0)
+ *current_mb_col = rightmost_col;
+#endif
+
+ /* this is to account for the border */
+ xd->mode_info_context++;
+ x->partition_info++;
+}
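+
+/* Editor's note on the CONFIG_MULTITHREAD hand-off above: each row
+ * publishes its progress through cpi->mt_current_mb_col[], and a row only
+ * encodes column mb_col once the row above has advanced past
+ * mb_col + nsync, because intra prediction and the entropy contexts depend
+ * on the above/above-right neighbours. A minimal sketch of the same wait
+ * (names hypothetical):
+ */
+#if 0
+static void wait_for_row_above(volatile const int *above_col, int mb_col,
+ int nsync)
+{
+ /* Spin politely until the row above is at least nsync MBs ahead. */
+ while (mb_col > (*above_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+}
+#endif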
+
+static void init_encode_frame_mb_context(VP8_COMP *cpi)
+{
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+
+ /* GF active flags data structure */
+ x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+ /* Activity map pointer */
+ x->mb_activity_ptr = cpi->mb_activity_map;
+
+ x->act_zbin_adj = 0;
+
+ x->partition_info = x->pi;
+
+ xd->mode_info_context = cm->mi;
+ xd->mode_info_stride = cm->mode_info_stride;
+
+ xd->frame_type = cm->frame_type;
+
+ /* reset intra mode contexts */
+ if (cm->frame_type == KEY_FRAME)
+ vp8_init_mbmode_probs(cm);
+
+ /* Copy data over into macro block data structures. */
+ x->src = * cpi->Source;
+ xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+ xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+ /* set up frame for intra coded blocks */
+ vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+
+ vp8_build_block_offsets(x);
+
+ xd->mode_info_context->mbmi.mode = DC_PRED;
+ xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+
+ xd->left_context = &cm->left_context;
+
+ vp8_zero(cpi->count_mb_ref_frame_usage)
+
+ x->mvc = cm->fc.mvc;
+
+ vpx_memset(cm->above_context, 0,
+ sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+
+ /* Special case treatment when GF and ARF are not sensible options
+ * for reference
+ */
+ if (cpi->ref_frame_flags == VP8_LAST_FRAME)
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,255,128);
+ else if ((cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags == VP8_GOLD_FRAME))
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,1,255);
+ else if ((cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags == VP8_ALTR_FRAME))
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,1,1);
+ else
+ vp8_calc_ref_frame_costs(x->ref_frame_cost,
+ cpi->prob_intra_coded,
+ cpi->prob_last_coded,
+ cpi->prob_gf_coded);
+
+ xd->fullpixel_mask = 0xffffffff;
+ if(cm->full_pixel)
+ xd->fullpixel_mask = 0xfffffff8;
+
+ vp8_zero(x->coef_counts);
+ vp8_zero(x->ymode_count);
+ vp8_zero(x->uv_mode_count)
+ x->prediction_error = 0;
+ x->intra_error = 0;
+}
+
+static void sum_coef_counts(MACROBLOCK *x, MACROBLOCK *x_thread)
+{
+ int i = 0;
+ do
+ {
+ int j = 0;
+ do
+ {
+ int k = 0;
+ do
+ {
+ /* at every context */
+
+ /* calc probs and branch cts for this frame only */
+ int t = 0; /* token/prob index */
+
+ do
+ {
+ x->coef_counts [i][j][k][t] +=
+ x_thread->coef_counts [i][j][k][t];
+ }
+ while (++t < ENTROPY_NODES);
+ }
+ while (++k < PREV_COEF_CONTEXTS);
+ }
+ while (++j < COEF_BANDS);
+ }
+ while (++i < BLOCK_TYPES);
+}
+
+void vp8_encode_frame(VP8_COMP *cpi)
+{
+ int mb_row;
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+ TOKENEXTRA *tp = cpi->tok;
+ int segment_counts[MAX_MB_SEGMENTS];
+ int totalrate;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ BOOL_CODER * bc = &cpi->bc[1]; /* bc[0] is for control partition */
+ const int num_part = (1 << cm->multi_token_partition);
+#endif
+
+ vpx_memset(segment_counts, 0, sizeof(segment_counts));
+ totalrate = 0;
+
+ if (cpi->compressor_speed == 2)
+ {
+ if (cpi->oxcf.cpu_used < 0)
+ cpi->Speed = -(cpi->oxcf.cpu_used);
+ else
+ vp8_auto_select_speed(cpi);
+ }
+
+ /* Set up functions for all frame types so we can use MC in AltRef */
+ if(!cm->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ cpi->mb.skip_true_count = 0;
+ cpi->tok_count = 0;
+
+#if 0
+ /* Experimental code */
+ cpi->frame_distortion = 0;
+ cpi->last_mb_distortion = 0;
+#endif
+
+ xd->mode_info_context = cm->mi;
+
+ vp8_zero(cpi->mb.MVcount);
+
+ vp8cx_frame_init_quantizer(cpi);
+
+ vp8_initialize_rd_consts(cpi,
+ vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+
+ vp8cx_initialize_me_consts(cpi, cm->base_qindex);
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ {
+ /* Initialize encode frame context. */
+ init_encode_frame_mb_context(cpi);
+
+ /* Build a frame level activity map */
+ build_activity_map(cpi);
+ }
+
+ /* re-init encode frame context. */
+ init_encode_frame_mb_context(cpi);
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ for(i = 0; i < num_part; i++)
+ {
+ vp8_start_encode(&bc[i], cpi->partition_d[i + 1],
+ cpi->partition_d_end[i + 1]);
+ bc[i].error = &cm->error;
+ }
+ }
+
+#endif
+
+ {
+ struct vpx_usec_timer emr_timer;
+ vpx_usec_timer_start(&emr_timer);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ {
+ int i;
+
+ vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1, cpi->encoding_thread_count);
+
+ for (i = 0; i < cm->mb_rows; i++)
+ cpi->mt_current_mb_col[i] = -1;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ sem_post(&cpi->h_event_start_encoding[i]);
+ }
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+ {
+ vp8_zero(cm->left_context)
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = cpi->tok;
+#else
+ tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+#endif
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+ xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
+
+ if(mb_row == cm->mb_rows - 1)
+ {
+ sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+ }
+ }
+
+ sem_wait(&cpi->h_event_end_encoding); /* wait for other threads to finish */
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ cpi->tok_count += (unsigned int)
+ (cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start);
+ }
+
+ if (xd->segmentation_enabled)
+ {
+ int i, j;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ for (j = 0; j < 4; j++)
+ segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+ }
+ }
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ int mode_count;
+ int mv_vals;
+ totalrate += cpi->mb_row_ei[i].totalrate;
+
+ cpi->mb.skip_true_count += cpi->mb_row_ei[i].mb.skip_true_count;
+
+ for(mode_count = 0; mode_count < VP8_YMODES; mode_count++)
+ cpi->mb.ymode_count[mode_count] +=
+ cpi->mb_row_ei[i].mb.ymode_count[mode_count];
+
+ for(mode_count = 0; mode_count < VP8_UV_MODES; mode_count++)
+ cpi->mb.uv_mode_count[mode_count] +=
+ cpi->mb_row_ei[i].mb.uv_mode_count[mode_count];
+
+ for(mv_vals = 0; mv_vals < MVvals; mv_vals++)
+ {
+ cpi->mb.MVcount[0][mv_vals] +=
+ cpi->mb_row_ei[i].mb.MVcount[0][mv_vals];
+ cpi->mb.MVcount[1][mv_vals] +=
+ cpi->mb_row_ei[i].mb.MVcount[1][mv_vals];
+ }
+
+ cpi->mb.prediction_error +=
+ cpi->mb_row_ei[i].mb.prediction_error;
+ cpi->mb.intra_error += cpi->mb_row_ei[i].mb.intra_error;
+
+ /* add up counts for each thread */
+ sum_coef_counts(x, &cpi->mb_row_ei[i].mb);
+ }
+
+ }
+ else
+#endif
+ {
+
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ vp8_zero(cm->left_context)
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = cpi->tok;
+#endif
+
+ encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ }
+
+ cpi->tok_count = (unsigned int)(tp - cpi->tok);
+ }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ for(i = 0; i < num_part; i++)
+ {
+ vp8_stop_encode(&bc[i]);
+ cpi->partition_sz[i+1] = bc[i].pos;
+ }
+ }
+#endif
+
+ vpx_usec_timer_mark(&emr_timer);
+ cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+ }
+
+
+ /* Work out the segment probabilities if segmentation is enabled
+ * and needs to be updated
+ */
+ if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
+ {
+ int tot_count;
+ int i;
+
+ /* Set to defaults */
+ vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+
+ tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+
+ if (tot_count)
+ {
+ xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count;
+
+ tot_count = segment_counts[0] + segment_counts[1];
+
+ if (tot_count > 0)
+ {
+ xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count;
+ }
+
+ tot_count = segment_counts[2] + segment_counts[3];
+
+ if (tot_count > 0)
+ xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
+
+ /* Zero probabilities not allowed */
+ for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
+ {
+ if (xd->mb_segment_tree_probs[i] == 0)
+ xd->mb_segment_tree_probs[i] = 1;
+ }
+ }
+ }
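+
+ /* Worked example (editor's addition): with segment_counts {10,20,30,40}
+ * the tree probabilities above come out as
+ * probs[0] = (10+20)*255/100 = 76 (P(segment in {0,1}))
+ * probs[1] = 10*255/30 = 85 (P(segment 0 within {0,1}))
+ * probs[2] = 30*255/70 = 109 (P(segment 2 within {2,3}))
+ * with any zero entry bumped to 1 so no probability is ever zero.
+ */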
+
+ /* projected_frame_size in units of BYTES */
+ cpi->projected_frame_size = totalrate >> 8;
+
+ /* Make a note of the percentage MBs coded Intra. */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->this_frame_percent_intra = 100;
+ }
+ else
+ {
+ int tot_modes;
+
+ tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME]
+ + cpi->count_mb_ref_frame_usage[LAST_FRAME]
+ + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]
+ + cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+
+ if (tot_modes)
+ cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+
+ }
+
+#if ! CONFIG_REALTIME_ONLY
+ /* Adjust the projected reference frame usage probability numbers to
+ * reflect what we have just seen. This may be useful when we make
+ * multiple iterations of the recode loop rather than continuing to use
+ * values from the previous frame.
+ */
+ if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+ (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
+ {
+ vp8_convert_rfct_to_prob(cpi);
+ }
+#endif
+}
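+
+/* Editor's note: in the multithreaded path above, the calling thread
+ * encodes rows 0, k+1, 2(k+1), ... while the k worker threads started via
+ * vp8cx_init_mbrthread_data() take the rows in between; per-thread counts
+ * are merged back afterwards (sum_coef_counts() etc.). A sketch of the row
+ * assignment, with hypothetical values:
+ */
+#if 0
+static void row_assignment_example(int mb_rows, int worker_threads)
+{
+ int stride = worker_threads + 1; /* e.g. 3 workers -> stride 4 */
+ int mb_row;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row += stride)
+ {
+ /* main thread encodes mb_row; workers cover mb_row+1..mb_row+3 */
+ }
+}
+#endif
+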
+void vp8_setup_block_ptrs(MACROBLOCK *x)
+{
+ int r, c;
+ int i;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ x->block[r*4+c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+ }
+ }
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[16 + r*2+c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+ }
+ }
+
+
+ for (r = 0; r < 2; r++)
+ {
+ for (c = 0; c < 2; c++)
+ {
+ x->block[20 + r*2+c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+ }
+ }
+
+ x->block[24].src_diff = x->src_diff + 384;
+
+
+ for (i = 0; i < 25; i++)
+ {
+ x->block[i].coeff = x->coeff + i * 16;
+ }
+}
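+
+/* Editor's note: the offsets above encode the fixed layout of the 400-entry
+ * src_diff buffer: the sixteen Y 4x4 blocks occupy 0..255 (a 16x16 raster
+ * with stride 16), the four U blocks 256..319, the four V blocks 320..383,
+ * and the second-order Y2/DC block 384..399. Each block's coeff pointer is
+ * simply i*16 into x->coeff.
+ */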
+
+void vp8_build_block_offsets(MACROBLOCK *x)
+{
+ int block = 0;
+ int br, bc;
+
+ vp8_build_block_doffsets(&x->e_mbd);
+
+ /* y blocks */
+ x->thismb_ptr = &x->thismb[0];
+ for (br = 0; br < 4; br++)
+ {
+ for (bc = 0; bc < 4; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->thismb_ptr;
+ this_block->src_stride = 16;
+ this_block->src = 4 * br * 16 + 4 * bc;
+ ++block;
+ }
+ }
+
+ /* u blocks */
+ for (br = 0; br < 2; br++)
+ {
+ for (bc = 0; bc < 2; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.u_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+
+ /* v blocks */
+ for (br = 0; br < 2; br++)
+ {
+ for (bc = 0; bc < 2; bc++)
+ {
+ BLOCK *this_block = &x->block[block];
+ this_block->base_src = &x->src.v_buffer;
+ this_block->src_stride = x->src.uv_stride;
+ this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+ ++block;
+ }
+ }
+}
+
+static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
+{
+ const MACROBLOCKD *xd = & x->e_mbd;
+ const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode;
+ const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+ const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+ ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+
+ if (m == B_PRED)
+ {
+ unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+ int b = 0;
+
+ do
+ {
+ ++ bct[xd->block[b].bmi.mode];
+ }
+ while (++b < 16);
+ }
+
+#endif
+
+ ++x->ymode_count[m];
+ ++x->uv_mode_count[uvm];
+
+}
+
+/* Experimental stub function to create a per MB zbin adjustment based on
+ * some previously calculated measure of MB activity.
+ */
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
+{
+#if USE_ACT_INDEX
+ x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
+ int64_t a;
+ int64_t b;
+ int64_t act = *(x->mb_activity_ptr);
+
+ /* Compute the masking ratio used for the zbin adjustment. */
+ a = act + 4*cpi->activity_avg;
+ b = 4*act + cpi->activity_avg;
+
+ if ( act > cpi->activity_avg )
+ x->act_zbin_adj = (int)(((int64_t)b + (a>>1))/a) - 1;
+ else
+ x->act_zbin_adj = 1 - (int)(((int64_t)a + (b>>1))/b);
+#endif
+}
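+
+/* Worked example (editor's addition): with a = act + 4*avg and
+ * b = 4*act + avg, an MB four times as active as the average gets
+ * act_zbin_adj = (17 + 4)/8 - 1 = +1, while one at a quarter of the
+ * average gets 1 - 21/8 = -1. A sketch with hypothetical inputs:
+ */
+#if 0
+static int zbin_adj_example(int64_t act, int64_t avg)
+{
+ int64_t a = act + 4 * avg;
+ int64_t b = 4 * act + avg;
+
+ /* act = 4*avg: a = 8*avg, b = 17*avg -> (b + a/2)/a = 2 -> adj = +1 */
+ if (act > avg)
+ return (int)((b + (a >> 1)) / a) - 1;
+ else
+ return 1 - (int)((a + (b >> 1)) / b);
+}
+#endif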
+
+int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ int rate;
+
+ if (cpi->sf.RD && cpi->compressor_speed != 2)
+ vp8_rd_pick_intra_mode(x, &rate);
+ else
+ vp8_pick_intra_mode(x, &rate);
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ {
+ adjust_act_zbin( cpi, x );
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
+ vp8_encode_intra4x4mby(x);
+ else
+ vp8_encode_intra16x16mby(x);
+
+ vp8_encode_intra16x16mbuv(x);
+
+ sum_intra_stats(cpi, x);
+
+ vp8_tokenize_mb(cpi, x, t);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_inverse_transform_mby(xd);
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ return rate;
+}
+#ifdef SPEEDSTATS
+extern int cnt_pm;
+#endif
+
+extern void vp8_fix_contexts(MACROBLOCKD *x);
+
+int vp8cx_encode_inter_macroblock
+(
+ VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int mb_row, int mb_col
+)
+{
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error = 0;
+ int rate;
+ int distortion;
+
+ x->skip = 0;
+
+ if (xd->segmentation_enabled)
+ x->encode_breakout = cpi->segment_encode_breakout[xd->mode_info_context->mbmi.segment_id];
+ else
+ x->encode_breakout = cpi->oxcf.encode_breakout;
+
+#if CONFIG_TEMPORAL_DENOISING
+ /* Reset the best sse mode/mv for each macroblock. */
+ x->best_reference_frame = INTRA_FRAME;
+ x->best_zeromv_reference_frame = INTRA_FRAME;
+ x->best_sse_inter_mode = 0;
+ x->best_sse_mv.as_int = 0;
+ x->need_to_clamp_best_mvs = 0;
+#endif
+
+ if (cpi->sf.RD)
+ {
+ int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
+
+ /* Are we using the fast quantizer for the mode selection? */
+ if(cpi->sf.use_fastquant_for_pick)
+ {
+ cpi->mb.quantize_b = vp8_fast_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
+
+ /* the fast quantizer does not use zbin_extra, so
+ * do not recalculate */
+ cpi->zbin_mode_boost_enabled = 0;
+ }
+ vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+ &distortion, &intra_error);
+
+ /* switch back to the regular quantizer for the encode */
+ if (cpi->sf.improved_quant)
+ {
+ cpi->mb.quantize_b = vp8_regular_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
+ }
+
+ /* restore cpi->zbin_mode_boost_enabled */
+ cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
+
+ }
+ else
+ {
+ vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
+ &distortion, &intra_error, mb_row, mb_col);
+ }
+
+ x->prediction_error += distortion;
+ x->intra_error += intra_error;
+
+ if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ {
+ /* Adjust the zbin based on this MB rate. */
+ adjust_act_zbin( cpi, x );
+ }
+
+#if 0
+ /* Experimental RD code */
+ cpi->frame_distortion += distortion;
+ cpi->last_mb_distortion = distortion;
+#endif
+
+ /* MB level adjustment to quantizer setup */
+ if (xd->segmentation_enabled)
+ {
+ /* If cyclic update enabled */
+ if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled)
+ {
+ /* Clear segment_id back to 0 if not coded (last frame 0,0) */
+ if ((xd->mode_info_context->mbmi.segment_id == 1) &&
+ ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
+ {
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ /* segment_id changed, so update */
+ vp8cx_mb_init_quantizer(cpi, x, 1);
+ }
+ }
+ }
+
+ {
+ /* Experimental code. Special case for gf and arf zeromv modes.
+ * Increase zbin size to suppress noise
+ */
+ cpi->zbin_mode_boost = 0;
+ if (cpi->zbin_mode_boost_enabled)
+ {
+ if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )
+ {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV)
+ {
+ if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ }
+ else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+ }
+
+ /* The fast quantizer doesn't use zbin_extra, so only update it
+ * when the regular quantizer is in use. */
+ if (cpi->sf.improved_quant)
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
+
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_encode_intra16x16mbuv(x);
+
+ if (xd->mode_info_context->mbmi.mode == B_PRED)
+ {
+ vp8_encode_intra4x4mby(x);
+ }
+ else
+ {
+ vp8_encode_intra16x16mby(x);
+ }
+
+ sum_intra_stats(cpi, x);
+ }
+ else
+ {
+ int ref_fb_idx;
+
+ if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+ ref_fb_idx = cpi->common.lst_fb_idx;
+ else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+ ref_fb_idx = cpi->common.gld_fb_idx;
+ else
+ ref_fb_idx = cpi->common.alt_fb_idx;
+
+ xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+ xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+ if (!x->skip)
+ {
+ vp8_encode_inter16x16(x);
+ }
+ else
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
+
+ }
+
+ if (!x->skip)
+ {
+ vp8_tokenize_mb(cpi, x, t);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ vp8_inverse_transform_mby(xd);
+
+ vp8_dequant_idct_add_uv_block
+ (xd->qcoeff+16*16, xd->dequant_uv,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+ else
+ {
+ /* always set mb_skip_coeff as it is needed by the loopfilter */
+ xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ x->skip_true_count ++;
+ vp8_fix_contexts(xd);
+ }
+ else
+ {
+ vp8_stuff_mb(cpi, x, t);
+ }
+ }
+
+ return rate;
+}
diff --git a/libvpx/vp8/encoder/encodeframe.h b/libvpx/vp8/encoder/encodeframe.h
new file mode 100644
index 0000000..4dd6ba0
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeframe.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef ENCODEFRAME_H
+#define ENCODEFRAME_H
+extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
+
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+
+extern void vp8_encode_frame(VP8_COMP *cpi);
+
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t,
+ int recon_yoffset, int recon_uvoffset,
+ int mb_row, int mb_col);
+
+extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+ TOKENEXTRA **t);
+#endif
diff --git a/libvpx/vp8/encoder/encodeintra.c b/libvpx/vp8/encoder/encodeintra.c
new file mode 100644
index 0000000..340dd63
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeintra.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "quantize.h"
+#include "vp8/common/reconintra4x4.h"
+#include "encodemb.h"
+#include "vp8/common/invtrans.h"
+#include "encodeintra.h"
+
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+{
+
+ int i;
+ int intra_pred_var = 0;
+ (void) cpi;
+
+ if (use_dc_pred)
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ vp8_encode_intra16x16mby(x);
+
+ vp8_inverse_transform_mby(&x->e_mbd);
+ }
+ else
+ {
+ for (i = 0; i < 16; i++)
+ {
+ x->e_mbd.block[i].bmi.as_mode = B_DC_PRED;
+ vp8_encode_intra4x4block(x, i);
+ }
+ }
+
+ intra_pred_var = vp8_get_mb_ss(x->src_diff);
+
+ return intra_pred_var;
+}
+
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib)
+{
+ BLOCKD *b = &x->e_mbd.block[ib];
+ BLOCK *be = &x->block[ib];
+ int dst_stride = x->e_mbd.dst.y_stride;
+ unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ unsigned char top_left = Above[-1];
+
+ vp8_intra4x4_predict(Above, yleft, dst_stride, b->bmi.as_mode,
+ b->predictor, 16, top_left);
+
+ vp8_subtract_b(be, b, 16);
+
+ x->short_fdct4x4(be->src_diff, be->coeff, 32);
+
+ x->quantize_b(be, b);
+
+ if (*b->eob > 1)
+ {
+ vp8_short_idct4x4llm(b->dqcoeff, b->predictor, 16, dst, dst_stride);
+ }
+ else
+ {
+ vp8_dc_only_idct_add(b->dqcoeff[0], b->predictor, 16, dst, dst_stride);
+ }
+}
+
+void vp8_encode_intra4x4mby(MACROBLOCK *mb)
+{
+ int i;
+
+ MACROBLOCKD *xd = &mb->e_mbd;
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+ for (i = 0; i < 16; i++)
+ vp8_encode_intra4x4block(mb, i);
+ return;
+}
+
+void vp8_encode_intra16x16mby(MACROBLOCK *x)
+{
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->dst.y_buffer,
+ xd->dst.y_stride);
+
+ vp8_subtract_mby(x->src_diff, *(b->base_src),
+ b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);
+
+ vp8_transform_intra_mby(x);
+
+ vp8_quantize_mby(x);
+
+ if (x->optimize)
+ vp8_optimize_mby(x);
+}
+
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ vp8_build_intra_predictors_mbuv_s(xd, xd->dst.u_buffer - xd->dst.uv_stride,
+ xd->dst.v_buffer - xd->dst.uv_stride,
+ xd->dst.u_buffer - 1,
+ xd->dst.v_buffer - 1,
+ xd->dst.uv_stride,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride);
+
+ vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
+ x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
+ xd->dst.v_buffer, xd->dst.uv_stride);
+
+ vp8_transform_mbuv(x);
+
+ vp8_quantize_mbuv(x);
+
+ if (x->optimize)
+ vp8_optimize_mbuv(x);
+}
diff --git a/libvpx/vp8/encoder/encodeintra.h b/libvpx/vp8/encoder/encodeintra.h
new file mode 100644
index 0000000..be2141f
--- /dev/null
+++ b/libvpx/vp8/encoder/encodeintra.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef _ENCODEINTRA_H_
+#define _ENCODEINTRA_H_
+#include "onyx_int.h"
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+void vp8_encode_intra16x16mby(MACROBLOCK *x);
+void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp8_encode_intra4x4mby(MACROBLOCK *mb);
+void vp8_encode_intra4x4block(MACROBLOCK *x, int ib);
+#endif
diff --git a/libvpx/vp8/encoder/encodemb.c b/libvpx/vp8/encoder/encodemb.c
new file mode 100644
index 0000000..7d494f2
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemb.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "quantize.h"
+#include "tokenize.h"
+#include "vp8/common/invtrans.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+
+void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
+
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+ }
+
+ diff_ptr += pitch;
+ pred_ptr += pitch;
+ src_ptr += src_stride;
+ }
+}
+
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+ int src_stride, unsigned char *upred,
+ unsigned char *vpred, int pred_stride)
+{
+ short *udiff = diff + 256;
+ short *vdiff = diff + 320;
+
+ int r, c;
+
+ for (r = 0; r < 8; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ udiff[c] = usrc[c] - upred[c];
+ }
+
+ udiff += 8;
+ upred += pred_stride;
+ usrc += src_stride;
+ }
+
+ for (r = 0; r < 8; r++)
+ {
+ for (c = 0; c < 8; c++)
+ {
+ vdiff[c] = vsrc[c] - vpred[c];
+ }
+
+ vdiff += 8;
+ vpred += pred_stride;
+ vsrc += src_stride;
+ }
+}
+
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
+ unsigned char *pred, int pred_stride)
+{
+ int r, c;
+
+ for (r = 0; r < 16; r++)
+ {
+ for (c = 0; c < 16; c++)
+ {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += 16;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+
+static void vp8_subtract_mb(MACROBLOCK *x)
+{
+ BLOCK *b = &x->block[0];
+
+ vp8_subtract_mby(x->src_diff, *(b->base_src),
+ b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
+ vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
+ x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer,
+ x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
+}
+
+static void build_dcblock(MACROBLOCK *x)
+{
+ short *src_diff_ptr = &x->src_diff[384];
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ src_diff_ptr[i] = x->coeff[i * 16];
+ }
+}
+
+void vp8_transform_mbuv(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
+ }
+}
+
+
+void vp8_transform_intra_mby(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ /* build dc block from 16 y dc values */
+ build_dcblock(x);
+
+ /* do 2nd order transform on the dc block */
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+
+}
+
+
+static void transform_mb(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ /* build dc block from 16 y dc values */
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
+ build_dcblock(x);
+
+ for (i = 16; i < 24; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
+ }
+
+ /* do 2nd order transform on the dc block */
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+
+}
+
+
+static void transform_mby(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 0; i < 16; i += 2)
+ {
+ x->short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
+ }
+
+ /* build dc block from 16 y dc values */
+ if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
+ {
+ build_dcblock(x);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
+ }
+}
+
+
+
+#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
+
+typedef struct vp8_token_state vp8_token_state;
+
+struct vp8_token_state{
+ int rate;
+ int error;
+ signed char next;
+ signed char token;
+ short qc;
+};
+
+/* TODO: experiments to find optimal multiplier values */
+#define Y1_RD_MULT 4
+#define UV_RD_MULT 2
+#define Y2_RD_MULT 16
+
+static const int plane_rd_mult[4]=
+{
+ Y1_RD_MULT,
+ Y2_RD_MULT,
+ UV_RD_MULT,
+ Y1_RD_MULT
+};
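+
+/* Editor's note: optimize_b() below runs a two-state Viterbi trellis over
+ * the coefficients in zig-zag order, comparing "keep the quantized value"
+ * against "round one step towards zero" at each position. Assuming the
+ * RDCOST macro used elsewhere in the encoder is of the form
+ * ((128 + rate*rdmult) >> 8) + rddiv*distortion, RDTRUNC above keeps only
+ * the fractional bits of the rate term and acts purely as a tie-breaker
+ * when two integer costs are equal.
+ */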
+
+static void optimize_b(MACROBLOCK *mb, int ib, int type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ BLOCK *b;
+ BLOCKD *d;
+ vp8_token_state tokens[17][2];
+ unsigned best_mask[2];
+ const short *dequant_ptr;
+ const short *coeff_ptr;
+ short *qcoeff_ptr;
+ short *dqcoeff_ptr;
+ int eob;
+ int i0;
+ int rc;
+ int x;
+ int sz = 0;
+ int next;
+ int rdmult;
+ int rddiv;
+ int final_eob;
+ int rd_cost0;
+ int rd_cost1;
+ int rate0;
+ int rate1;
+ int error0;
+ int error1;
+ int t0;
+ int t1;
+ int best;
+ int band;
+ int pt;
+ int i;
+ int err_mult = plane_rd_mult[type];
+
+ b = &mb->block[ib];
+ d = &mb->e_mbd.block[ib];
+
+ /* Enable this to test the effect of RDO as a replacement for the dynamic
+ * zero bin instead of an augmentation of it.
+ */
+#if 0
+ vp8_strict_quantize_b(b, d);
+#endif
+
+ dequant_ptr = d->dequant;
+ coeff_ptr = b->coeff;
+ qcoeff_ptr = d->qcoeff;
+ dqcoeff_ptr = d->dqcoeff;
+ i0 = !type;
+ eob = *d->eob;
+
+ /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+ rdmult = mb->rdmult * err_mult;
+ if(mb->e_mbd.mode_info_context->mbmi.ref_frame==INTRA_FRAME)
+ rdmult = (rdmult * 9)>>4;
+
+ rddiv = mb->rddiv;
+ best_mask[0] = best_mask[1] = 0;
+ /* Initialize the sentinel node of the trellis. */
+ tokens[eob][0].rate = 0;
+ tokens[eob][0].error = 0;
+ tokens[eob][0].next = 16;
+ tokens[eob][0].token = DCT_EOB_TOKEN;
+ tokens[eob][0].qc = 0;
+ *(tokens[eob] + 1) = *(tokens[eob] + 0);
+ next = eob;
+ for (i = eob; i-- > i0;)
+ {
+ int base_bits;
+ int d2;
+ int dx;
+
+ rc = vp8_default_zig_zag1d[i];
+ x = qcoeff_ptr[rc];
+ /* Only add a trellis state for non-zero coefficients. */
+ if (x)
+ {
+ int shortcut=0;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ /* Evaluate the first possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ t0 = (vp8_dct_value_tokens_ptr + x)->Token;
+ /* Consider both possible successor states. */
+ if (next < 16)
+ {
+ band = vp8_coef_bands[i + 1];
+ pt = vp8_prev_token_class[t0];
+ rate0 +=
+ mb->token_costs[type][band][pt][tokens[next][0].token];
+ rate1 +=
+ mb->token_costs[type][band][pt][tokens[next][1].token];
+ }
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+ if (rd_cost0 == rd_cost1)
+ {
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+ }
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp8_dct_value_cost_ptr + x);
+ dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+ d2 = dx*dx;
+ tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][0].error = d2 + (best ? error1 : error0);
+ tokens[i][0].next = next;
+ tokens[i][0].token = t0;
+ tokens[i][0].qc = x;
+ best_mask[0] |= best << i;
+ /* Evaluate the second possibility for this state. */
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+
+ if((abs(x)*dequant_ptr[rc]>abs(coeff_ptr[rc])) &&
+ (abs(x)*dequant_ptr[rc]<abs(coeff_ptr[rc])+dequant_ptr[rc]))
+ shortcut = 1;
+ else
+ shortcut = 0;
+
+ if(shortcut)
+ {
+ sz = -(x < 0);
+ x -= 2*sz + 1;
+ }
+
+ /* Consider both possible successor states. */
+ if (!x)
+ {
+ /* If we reduced this coefficient to zero, check to see if
+ * we need to move the EOB back here.
+ */
+ t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+ DCT_EOB_TOKEN : ZERO_TOKEN;
+ }
+ else
+ {
+ t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token;
+ }
+ if (next < 16)
+ {
+ band = vp8_coef_bands[i + 1];
+ if(t0!=DCT_EOB_TOKEN)
+ {
+ pt = vp8_prev_token_class[t0];
+ rate0 += mb->token_costs[type][band][pt][
+ tokens[next][0].token];
+ }
+ if(t1!=DCT_EOB_TOKEN)
+ {
+ pt = vp8_prev_token_class[t1];
+ rate1 += mb->token_costs[type][band][pt][
+ tokens[next][1].token];
+ }
+ }
+
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+ if (rd_cost0 == rd_cost1)
+ {
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+ }
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ base_bits = *(vp8_dct_value_cost_ptr + x);
+
+ if(shortcut)
+ {
+ dx -= (dequant_ptr[rc] + sz) ^ sz;
+ d2 = dx*dx;
+ }
+ tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+ tokens[i][1].error = d2 + (best ? error1 : error0);
+ tokens[i][1].next = next;
+ tokens[i][1].token = best ? t1 : t0;
+ tokens[i][1].qc = x;
+ best_mask[1] |= best << i;
+ /* Finally, make this the new head of the trellis. */
+ next = i;
+ }
+ /* There's no choice to make for a zero coefficient, so we don't
+ * add a new trellis node, but we do need to update the costs.
+ */
+ else
+ {
+ band = vp8_coef_bands[i + 1];
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ /* Update the cost of each path if we're past the EOB token. */
+ if (t0 != DCT_EOB_TOKEN)
+ {
+ tokens[next][0].rate += mb->token_costs[type][band][0][t0];
+ tokens[next][0].token = ZERO_TOKEN;
+ }
+ if (t1 != DCT_EOB_TOKEN)
+ {
+ tokens[next][1].rate += mb->token_costs[type][band][0][t1];
+ tokens[next][1].token = ZERO_TOKEN;
+ }
+ /* Don't update next, because we didn't add a new node. */
+ }
+ }
+
+ /* Now pick the best path through the whole trellis. */
+ band = vp8_coef_bands[i + 1];
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ rate0 = tokens[next][0].rate;
+ rate1 = tokens[next][1].rate;
+ error0 = tokens[next][0].error;
+ error1 = tokens[next][1].error;
+ t0 = tokens[next][0].token;
+ t1 = tokens[next][1].token;
+ rate0 += mb->token_costs[type][band][pt][t0];
+ rate1 += mb->token_costs[type][band][pt][t1];
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
+ if (rd_cost0 == rd_cost1)
+ {
+ rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
+ rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
+ }
+ best = rd_cost1 < rd_cost0;
+ final_eob = i0 - 1;
+ for (i = next; i < eob; i = next)
+ {
+ x = tokens[i][best].qc;
+ if (x)
+ final_eob = i;
+ rc = vp8_default_zig_zag1d[i];
+ qcoeff_ptr[rc] = x;
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc];
+ next = tokens[i][best].next;
+ best = (best_mask[best] >> i) & 1;
+ }
+ final_eob++;
+
+ *a = *l = (final_eob != !type);
+ *d->eob = (char)final_eob;
+}
+static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type,
+ ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ int sum=0;
+ int i;
+ BLOCKD *bd = &x->block[24];
+
+ if(bd->dequant[0]>=35 && bd->dequant[1]>=35)
+ return;
+
+ for(i=0;i<(*bd->eob);i++)
+ {
+ int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
+ sum+= (coef>=0)?coef:-coef;
+ if(sum>=35)
+ return;
+ }
+ /**************************************************************************
+ Our inverse Hadamard transform is effectively a weighted sum of all 16
+ inputs with weights of either 1 or -1. It has a last-stage scaling of
+ (sum+3)>>3, and the dc-only idct is (dc+4)>>3. So if all the sums are
+ between -35 and 29, the output after the inverse wht and idct will be all
+ zero. A sum of absolute values smaller than 35 guarantees that all 16
+ different (+1/-1) weighted sums in the wht fall between -35 and +35.
+ **************************************************************************/
+ if(sum < 35)
+ {
+ for(i=0;i<(*bd->eob);i++)
+ {
+ int rc = vp8_default_zig_zag1d[i];
+ bd->qcoeff[rc]=0;
+ bd->dqcoeff[rc]=0;
+ }
+ *bd->eob = 0;
+ *a = *l = (*bd->eob != !type);
+ }
+}
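+
+/* Worked example (editor's addition): with Y2 dqcoeffs {10, -12, 8} the
+ * running absolute sum is 10+12+8 = 30 < 35, so every (+1/-1)-weighted
+ * combination of the 16 inputs lies in [-30, +30] and the whole block can
+ * be zeroed above without changing the final reconstruction.
+ */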
+
+static void optimize_mb(MACROBLOCK *x)
+{
+ int b;
+ int type;
+ int has_2nd_order;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+ type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+ for (b = 0; b < 16; b++)
+ {
+ optimize_b(x, b, type,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+
+ for (b = 16; b < 24; b++)
+ {
+ optimize_b(x, b, PLANE_TYPE_UV,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+
+ if (has_2nd_order)
+ {
+ b=24;
+ optimize_b(x, b, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+}
+
+
+void vp8_optimize_mby(MACROBLOCK *x)
+{
+ int b;
+ int type;
+ int has_2nd_order;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (!x->e_mbd.above_context)
+ return;
+
+ if (!x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+ type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
+
+ for (b = 0; b < 16; b++)
+ {
+ optimize_b(x, b, type,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+
+
+ if (has_2nd_order)
+ {
+ b=24;
+ optimize_b(x, b, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+}
+
+void vp8_optimize_mbuv(MACROBLOCK *x)
+{
+ int b;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ if (!x->e_mbd.above_context)
+ return;
+
+ if (!x->e_mbd.left_context)
+ return;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b++)
+ {
+ optimize_b(x, b, PLANE_TYPE_UV,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+ }
+}
+
+void vp8_encode_inter16x16(MACROBLOCK *x)
+{
+ vp8_build_inter_predictors_mb(&x->e_mbd);
+
+ vp8_subtract_mb(x);
+
+ transform_mb(x);
+
+ vp8_quantize_mb(x);
+
+ if (x->optimize)
+ optimize_mb(x);
+}
+
+/* this function is used by the first pass only */
+void vp8_encode_inter16x16y(MACROBLOCK *x)
+{
+ BLOCK *b = &x->block[0];
+
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
+ x->e_mbd.dst.y_stride);
+
+ vp8_subtract_mby(x->src_diff, *(b->base_src),
+ b->src_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
+
+ transform_mby(x);
+
+ vp8_quantize_mby(x);
+
+ vp8_inverse_transform_mby(&x->e_mbd);
+}
diff --git a/libvpx/vp8/encoder/encodemb.h b/libvpx/vp8/encoder/encodemb.h
new file mode 100644
index 0000000..6badf7d
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemb.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "onyx_int.h"
+void vp8_encode_inter16x16(MACROBLOCK *x);
+
+void vp8_build_dcblock(MACROBLOCK *b);
+void vp8_transform_mb(MACROBLOCK *mb);
+void vp8_transform_mbuv(MACROBLOCK *x);
+void vp8_transform_intra_mby(MACROBLOCK *x);
+
+void vp8_optimize_mby(MACROBLOCK *x);
+void vp8_optimize_mbuv(MACROBLOCK *x);
+void vp8_encode_inter16x16y(MACROBLOCK *x);
+#endif
diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c
new file mode 100644
index 0000000..0c43d06
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemv.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/common.h"
+#include "encodemv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
+static void encode_mvcomponent(
+ vp8_writer *const w,
+ const int v,
+ const struct mv_context *mvc
+)
+{
+ const vp8_prob *p = mvc->prob;
+ const int x = v < 0 ? -v : v;
+
+ if (x < mvnum_short) /* Small */
+ {
+ vp8_write(w, 0, p [mvpis_short]);
+ vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
+
+ if (!x)
+ return; /* no sign bit */
+ }
+ else /* Large */
+ {
+ int i = 0;
+
+ vp8_write(w, 1, p [mvpis_short]);
+
+ do
+ vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+ while (--i > 3);
+
+ if (x & 0xFFF0)
+ vp8_write(w, (x >> 3) & 1, p [MVPbits + 3]);
+ }
+
+ vp8_write(w, v < 0, p [MVPsign]);
+}
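+
+/* Worked example (editor's addition): a magnitude below mvnum_short (8) is
+ * coded with the small-MV tree; larger magnitudes send raw bits instead:
+ * bits 0..2 first, then bits 9..4 from the top down, and bit 3 only when a
+ * higher bit is set (x & 0xFFF0), since it is implicit otherwise. E.g.
+ * x = 25 = 0b0000011001 sends {1,0,0}, then {0,0,0,0,0,1}, then bit 3 = 1
+ * because 25 & 0xFFF0 is nonzero; a sign bit follows unless x is zero.
+ */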
+#if 0
+static int max_mv_r = 0;
+static int max_mv_c = 0;
+#endif
+void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc)
+{
+
+#if 0
+ {
+ if (abs(mv->row >> 1) > max_mv_r)
+ {
+ FILE *f = fopen("maxmv.stt", "a");
+ max_mv_r = abs(mv->row >> 1);
+ fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1));
+
+ if ((abs(mv->row) / 2) != max_mv_r)
+ fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2);
+
+ fclose(f);
+ }
+
+ if (abs(mv->col >> 1) > max_mv_c)
+ {
+ FILE *f = fopen("maxmv.stt", "a");
+ fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1));
+ max_mv_c = abs(mv->col >> 1);
+ fclose(f);
+ }
+ }
+#endif
+
+ encode_mvcomponent(w, mv->row >> 1, &mvc[0]);
+ encode_mvcomponent(w, mv->col >> 1, &mvc[1]);
+}
+
+
+static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
+{
+ const vp8_prob *p = mvc->prob;
+ const int x = v;
+ unsigned int cost;
+
+ if (x < mvnum_short)
+ {
+ cost = vp8_cost_zero(p [mvpis_short])
+ + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3);
+
+ if (!x)
+ return cost;
+ }
+ else
+ {
+ int i = 0;
+ cost = vp8_cost_one(p [mvpis_short]);
+
+ do
+ cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+ while (++i < 3);
+
+ i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
+
+ do
+ cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+ while (--i > 3);
+
+ if (x & 0xFFF0)
+ cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
+ }
+
+ return cost; /* + vp8_cost_bit( p [MVPsign], v < 0); */
+}
+
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+{
+ int i = 1;
+ unsigned int cost0 = 0;
+ unsigned int cost1 = 0;
+
+ vp8_clear_system_state();
+
+ i = 1;
+
+ if (mvc_flag[0])
+ {
+ mvcost [0] [0] = cost_mvcomponent(0, &mvc[0]);
+
+ do
+ {
+ cost0 = cost_mvcomponent(i, &mvc[0]);
+
+ mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]);
+ mvcost [0] [-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]);
+ }
+ while (++i <= mv_max);
+ }
+
+ i = 1;
+
+ if (mvc_flag[1])
+ {
+ mvcost [1] [0] = cost_mvcomponent(0, &mvc[1]);
+
+ do
+ {
+ cost1 = cost_mvcomponent(i, &mvc[1]);
+
+ mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]);
+ mvcost [1] [-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]);
+ }
+ while (++i <= mv_max);
+ }
+}
+
+
+/* Motion vector probability table update depends on benefit.
+ * Small correction allows for the fact that an update to an MV probability
+ * may have benefit in subsequent frames as well as the current one.
+ */
+#define MV_PROB_UPDATE_CORRECTION -1
+
+
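+/* Derive an updated probability for a binary branch from its observed 0/1
+ * counts. The result is forced even because only its top seven bits are
+ * transmitted by update() below; a zero result is replaced by 1 to keep the
+ * probability valid.
+ */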
+static void calc_prob(vp8_prob *p, const unsigned int ct[2])
+{
+ const unsigned int tot = ct[0] + ct[1];
+
+ if (tot)
+ {
+ const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+ *p = x ? x : 1;
+ }
+}
+
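+/* Conditionally signal an updated branch probability. The saving from
+ * switching to new_p, measured over the branch counts, must exceed the
+ * signalling cost: seven literal bits plus the extra cost of coding the
+ * update flag as 1 rather than 0 (converted from 1/256th-bit units to
+ * whole bits by the rounding >> 8), less one bit to allow for benefit
+ * carried into subsequent frames (MV_PROB_UPDATE_CORRECTION).
+ */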
+static void update(
+ vp8_writer *const w,
+ const unsigned int ct[2],
+ vp8_prob *const cur_p,
+ const vp8_prob new_p,
+ const vp8_prob update_p,
+ int *updated
+)
+{
+ const int cur_b = vp8_cost_branch(ct, *cur_p);
+ const int new_b = vp8_cost_branch(ct, new_p);
+ const int cost = 7 + MV_PROB_UPDATE_CORRECTION + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8);
+
+ if (cur_b - new_b > cost)
+ {
+ *cur_p = new_p;
+ vp8_write(w, 1, update_p);
+ vp8_write_literal(w, new_p >> 1, 7);
+ *updated = 1;
+
+ }
+ else
+ vp8_write(w, 0, update_p);
+}
+
+static void write_component_probs(
+ vp8_writer *const w,
+ struct mv_context *cur_mvc,
+ const struct mv_context *default_mvc_,
+ const struct mv_context *update_mvc,
+ const unsigned int events [MVvals],
+ unsigned int rc,
+ int *updated
+)
+{
+ vp8_prob *Pcur = cur_mvc->prob;
+ const vp8_prob *default_mvc = default_mvc_->prob;
+ const vp8_prob *Pupdate = update_mvc->prob;
+ unsigned int is_short_ct[2], sign_ct[2];
+
+ unsigned int bit_ct [mvlong_width] [2];
+
+ unsigned int short_ct [mvnum_short];
+ unsigned int short_bct [mvnum_short-1] [2];
+
+ vp8_prob Pnew [MVPcount];
+
+ (void) rc;
+ vp8_copy_array(Pnew, default_mvc, MVPcount);
+
+ vp8_zero(is_short_ct)
+ vp8_zero(sign_ct)
+ vp8_zero(bit_ct)
+ vp8_zero(short_ct)
+ vp8_zero(short_bct)
+
+
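+    /* "events" is a histogram of component values offset by mv_max:
+     * events[mv_max] counts zero vectors, events[mv_max + j] positive
+     * vectors of magnitude j and events[mv_max - j] negative ones.
+     */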
+ /* j=0 */
+ {
+ const int c = events [mv_max];
+
+ is_short_ct [0] += c; /* Short vector */
+ short_ct [0] += c; /* Magnitude distribution */
+ }
+
+ /* j: 1 ~ mv_max (1023) */
+ {
+ int j = 1;
+
+ do
+ {
+ const int c1 = events [mv_max + j]; /* positive */
+ const int c2 = events [mv_max - j]; /* negative */
+ const int c = c1 + c2;
+ int a = j;
+
+ sign_ct [0] += c1;
+ sign_ct [1] += c2;
+
+ if (a < mvnum_short)
+ {
+ is_short_ct [0] += c; /* Short vector */
+ short_ct [a] += c; /* Magnitude distribution */
+ }
+ else
+ {
+ int k = mvlong_width - 1;
+ is_short_ct [1] += c; /* Long vector */
+
+ /* bit 3 not always encoded. */
+ do
+ bit_ct [k] [(a >> k) & 1] += c;
+
+ while (--k >= 0);
+ }
+ }
+ while (++j <= mv_max);
+ }
+
+ calc_prob(Pnew + mvpis_short, is_short_ct);
+
+ calc_prob(Pnew + MVPsign, sign_ct);
+
+ {
+ vp8_prob p [mvnum_short - 1]; /* actually only need branch ct */
+ int j = 0;
+
+ vp8_tree_probs_from_distribution(
+ 8, vp8_small_mvencodings, vp8_small_mvtree,
+ p, short_bct, short_ct,
+ 256, 1
+ );
+
+ do
+ calc_prob(Pnew + MVPshort + j, short_bct[j]);
+
+ while (++j < mvnum_short - 1);
+ }
+
+ {
+ int j = 0;
+
+ do
+ calc_prob(Pnew + MVPbits + j, bit_ct[j]);
+
+ while (++j < mvlong_width);
+ }
+
+ update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated);
+
+ update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated);
+
+ {
+ const vp8_prob *const new_p = Pnew + MVPshort;
+ vp8_prob *const cur_p = Pcur + MVPshort;
+
+ int j = 0;
+
+        do
+            update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+        while (++j < mvnum_short - 1);
+ }
+
+ {
+ const vp8_prob *const new_p = Pnew + MVPbits;
+ vp8_prob *const cur_p = Pcur + MVPbits;
+
+ int j = 0;
+
+        do
+            update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+        while (++j < mvlong_width);
+ }
+}
+
+void vp8_write_mvprobs(VP8_COMP *cpi)
+{
+ vp8_writer *const w = cpi->bc;
+ MV_CONTEXT *mvc = cpi->common.fc.mvc;
+ int flags[2] = {0, 0};
+#ifdef ENTROPY_STATS
+ active_section = 4;
+#endif
+ write_component_probs(
+ w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0],
+ cpi->mb.MVcount[0], 0, &flags[0]
+ );
+ write_component_probs(
+ w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1],
+ cpi->mb.MVcount[1], 1, &flags[1]
+ );
+
+ if (flags[0] || flags[1])
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+
+#ifdef ENTROPY_STATS
+ active_section = 5;
+#endif
+}
diff --git a/libvpx/vp8/encoder/encodemv.h b/libvpx/vp8/encoder/encodemv.h
new file mode 100644
index 0000000..a6116c1
--- /dev/null
+++ b/libvpx/vp8/encoder/encodemv.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp8_write_mvprobs(VP8_COMP *);
+void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+
+#endif
diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c
new file mode 100644
index 0000000..39340f2
--- /dev/null
+++ b/libvpx/vp8/encoder/ethreading.c
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyx_int.h"
+#include "vp8/common/threading.h"
+#include "vp8/common/common.h"
+#include "vp8/common/extend.h"
+#include "bitstream.h"
+#include "encodeframe.h"
+
+#if CONFIG_MULTITHREAD
+
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);
+
+extern void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+
+static THREAD_FUNCTION thread_loopfilter(void *p_data)
+{
+ VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
+ VP8_COMMON *cm = &cpi->common;
+
+ while (1)
+ {
+ if (cpi->b_multi_threaded == 0)
+ break;
+
+ if (sem_wait(&cpi->h_event_start_lpf) == 0)
+ {
+ if (cpi->b_multi_threaded == 0) /* we're shutting down */
+ break;
+
+ vp8_loopfilter_frame(cpi, cm);
+
+ sem_post(&cpi->h_event_end_lpf);
+ }
+ }
+
+ return 0;
+}
+
+static THREAD_FUNCTION thread_encoding_proc(void *p_data)
+{
+ int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
+ VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+ MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
+ ENTROPY_CONTEXT_PLANES mb_row_left_context;
+
+ while (1)
+ {
+ if (cpi->b_multi_threaded == 0)
+ break;
+
+ if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
+ {
+ const int nsync = cpi->mt_sync_range;
+ VP8_COMMON *cm = &cpi->common;
+ int mb_row;
+ MACROBLOCK *x = &mbri->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+            TOKENEXTRA *tp;
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ TOKENEXTRA *tp_start = cpi->tok + (1 + ithread) * (16 * 24);
+ const int num_part = (1 << cm->multi_token_partition);
+#endif
+
+ int *segment_counts = mbri->segment_counts;
+ int *totalrate = &mbri->totalrate;
+
+ if (cpi->b_multi_threaded == 0) /* we're shutting down */
+ break;
+
+ for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+ {
+
+ int recon_yoffset, recon_uvoffset;
+ int mb_col;
+ int ref_fb_idx = cm->lst_fb_idx;
+ int dst_fb_idx = cm->new_fb_idx;
+ int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+ int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+ int map_index = (mb_row * cm->mb_cols);
+ volatile const int *last_row_current_mb_col;
+ volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row];
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)];
+#else
+ tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
+ cpi->tplist[mb_row].start = tp;
+#endif
+
+ last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
+
+ /* reset above block coeffs */
+ xd->above_context = cm->above_context;
+ xd->left_context = &mb_row_left_context;
+
+ vp8_zero(mb_row_left_context);
+
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ /* Set the mb activity pointer to the start of the row. */
+ x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ *current_mb_col = mb_col - 1;
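+                    /* Row synchronisation: every nsync columns, stall until
+                     * the thread working on the row above is at least nsync
+                     * macroblocks ahead, so the reconstructed pixels this
+                     * row depends on are available.
+                     */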
+
+ if ((mb_col & (nsync - 1)) == 0)
+ {
+ while (mb_col > (*last_row_current_mb_col - nsync))
+ {
+ x86_pause_hint();
+ thread_sleep(0);
+ }
+ }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ tp = tp_start;
+#endif
+
+                    /* Distance of Mb to the various image edges.
+                     * These are specified to 1/8th pel as they are always
+                     * compared to values that are in 1/8th pel units
+                     */
+ xd->mb_to_left_edge = -((mb_col * 16) << 3);
+ xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+ xd->mb_to_top_edge = -((mb_row * 16) << 3);
+ xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+ /* Set up limit values for motion vectors used to prevent
+ * them extending outside the UMV borders
+ */
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+ xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+ xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ x->rddiv = cpi->RDDIV;
+ x->rdmult = cpi->RDMULT;
+
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+ vp8_activity_masking(cpi, x);
+
+ /* Is segmentation enabled */
+ /* MB level adjustment to quantizer */
+ if (xd->segmentation_enabled)
+ {
+ /* Code to set segment id in xd->mbmi.segment_id for
+ * current MB (with range checking)
+ */
+ if (cpi->segmentation_map[map_index + mb_col] <= 3)
+ xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index + mb_col];
+ else
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ vp8cx_mb_init_quantizer(cpi, x, 1);
+ }
+ else
+ /* Set to Segment 0 by default */
+ xd->mode_info_context->mbmi.segment_id = 0;
+
+ x->active_ptr = cpi->active_map + map_index + mb_col;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ *totalrate += vp8cx_encode_intra_macroblock(cpi, x, &tp);
+#ifdef MODE_STATS
+ y_modes[xd->mbmi.mode] ++;
+#endif
+ }
+ else
+ {
+ *totalrate += vp8cx_encode_inter_macroblock(cpi, x, &tp, recon_yoffset, recon_uvoffset, mb_row, mb_col);
+
+#ifdef MODE_STATS
+ inter_y_modes[xd->mbmi.mode] ++;
+
+ if (xd->mbmi.mode == SPLITMV)
+ {
+ int b;
+
+ for (b = 0; b < xd->mbmi.partition_count; b++)
+ {
+ inter_b_modes[x->partition->bmi[b].mode] ++;
+ }
+ }
+
+#endif
+
+                    /* Special case code for cyclic refresh.
+                     * If cyclic update is enabled then copy
+                     * xd->mbmi.segment_id (which may have been updated
+                     * based on mode during
+                     * vp8cx_encode_inter_macroblock()) back into the
+                     * global segmentation map.
+                     */
+ if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+ {
+ const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+ cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
+
+                            /* If the block has been refreshed mark it as clean
+                             * (the magnitude of the -ve value influences how
+                             * long it will be before we consider another
+                             * refresh). Else if it was coded (last frame 0,0)
+                             * and has not already been refreshed, mark it as a
+                             * candidate for cleanup next time (marked 0);
+                             * otherwise mark it as dirty (1).
+                             */
+ if (mbmi->segment_id)
+ cpi->cyclic_refresh_map[map_index + mb_col] = -1;
+ else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+ {
+ if (cpi->cyclic_refresh_map[map_index + mb_col] == 1)
+ cpi->cyclic_refresh_map[map_index + mb_col] = 0;
+ }
+ else
+ cpi->cyclic_refresh_map[map_index + mb_col] = 1;
+
+ }
+ }
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* pack tokens for this MB */
+ {
+ int tok_count = tp - tp_start;
+ pack_tokens(w, tp_start, tok_count);
+ }
+#else
+ cpi->tplist[mb_row].stop = tp;
+#endif
+ /* Increment pointer into gf usage flags structure. */
+ x->gf_active_ptr++;
+
+ /* Increment the activity mask pointers. */
+ x->mb_activity_ptr++;
+
+ /* adjust to the next column of macroblocks */
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+
+ /* Keep track of segment usage */
+ segment_counts[xd->mode_info_context->mbmi.segment_id]++;
+
+ /* skip to next mb */
+ xd->mode_info_context++;
+ x->partition_info++;
+ xd->above_context++;
+ }
+
+ vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
+ xd->dst.y_buffer + 16,
+ xd->dst.u_buffer + 8,
+ xd->dst.v_buffer + 8);
+
+ *current_mb_col = mb_col + nsync;
+
+ /* this is to account for the border */
+ xd->mode_info_context++;
+ x->partition_info++;
+
+ x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+ xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+ x->gf_active_ptr += cm->mb_cols * cpi->encoding_thread_count;
+
+ if (mb_row == cm->mb_rows - 1)
+ {
+ sem_post(&cpi->h_event_end_encoding); /* signal frame encoding end */
+ }
+ }
+ }
+ }
+
+ /* printf("exit thread %d\n", ithread); */
+ return 0;
+}
+
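+/* Copy the fields of the main thread's MACROBLOCK that stay constant for
+ * the frame into a worker thread's copy. Per-row pointers and buffers are
+ * set up separately in vp8cx_init_mbrthread_data().
+ */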
+static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
+{
+
+ MACROBLOCK *x = mbsrc;
+ MACROBLOCK *z = mbdst;
+ int i;
+
+ z->ss = x->ss;
+ z->ss_count = x->ss_count;
+ z->searches_per_step = x->searches_per_step;
+ z->errorperbit = x->errorperbit;
+
+ z->sadperbit16 = x->sadperbit16;
+ z->sadperbit4 = x->sadperbit4;
+
+ /*
+ z->mv_col_min = x->mv_col_min;
+ z->mv_col_max = x->mv_col_max;
+ z->mv_row_min = x->mv_row_min;
+ z->mv_row_max = x->mv_row_max;
+ */
+
+ z->short_fdct4x4 = x->short_fdct4x4;
+ z->short_fdct8x4 = x->short_fdct8x4;
+ z->short_walsh4x4 = x->short_walsh4x4;
+ z->quantize_b = x->quantize_b;
+ z->quantize_b_pair = x->quantize_b_pair;
+ z->optimize = x->optimize;
+
+ /*
+ z->mvc = x->mvc;
+ z->src.y_buffer = x->src.y_buffer;
+ z->src.u_buffer = x->src.u_buffer;
+ z->src.v_buffer = x->src.v_buffer;
+ */
+
+ z->mvcost[0] = x->mvcost[0];
+ z->mvcost[1] = x->mvcost[1];
+ z->mvsadcost[0] = x->mvsadcost[0];
+ z->mvsadcost[1] = x->mvsadcost[1];
+
+ z->token_costs = x->token_costs;
+ z->inter_bmode_costs = x->inter_bmode_costs;
+ z->mbmode_cost = x->mbmode_cost;
+ z->intra_uv_mode_cost = x->intra_uv_mode_cost;
+ z->bmode_costs = x->bmode_costs;
+
+ for (i = 0; i < 25; i++)
+ {
+ z->block[i].quant = x->block[i].quant;
+ z->block[i].quant_fast = x->block[i].quant_fast;
+ z->block[i].quant_shift = x->block[i].quant_shift;
+ z->block[i].zbin = x->block[i].zbin;
+ z->block[i].zrun_zbin_boost = x->block[i].zrun_zbin_boost;
+ z->block[i].round = x->block[i].round;
+ z->block[i].src_stride = x->block[i].src_stride;
+ }
+
+ z->q_index = x->q_index;
+ z->act_zbin_adj = x->act_zbin_adj;
+ z->last_act_zbin_adj = x->last_act_zbin_adj;
+
+ {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MACROBLOCKD *zd = &z->e_mbd;
+
+ /*
+ zd->mode_info_context = xd->mode_info_context;
+ zd->mode_info = xd->mode_info;
+
+ zd->mode_info_stride = xd->mode_info_stride;
+ zd->frame_type = xd->frame_type;
+ zd->up_available = xd->up_available ;
+ zd->left_available = xd->left_available;
+ zd->left_context = xd->left_context;
+ zd->last_frame_dc = xd->last_frame_dc;
+ zd->last_frame_dccons = xd->last_frame_dccons;
+ zd->gold_frame_dc = xd->gold_frame_dc;
+ zd->gold_frame_dccons = xd->gold_frame_dccons;
+ zd->mb_to_left_edge = xd->mb_to_left_edge;
+ zd->mb_to_right_edge = xd->mb_to_right_edge;
+ zd->mb_to_top_edge = xd->mb_to_top_edge ;
+ zd->mb_to_bottom_edge = xd->mb_to_bottom_edge;
+ zd->gf_active_ptr = xd->gf_active_ptr;
+ zd->frames_since_golden = xd->frames_since_golden;
+ zd->frames_till_alt_ref_frame = xd->frames_till_alt_ref_frame;
+ */
+ zd->subpixel_predict = xd->subpixel_predict;
+ zd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ zd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+ zd->segmentation_enabled = xd->segmentation_enabled;
+ zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
+ vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data,
+ sizeof(xd->segment_feature_data));
+
+ vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc,
+ sizeof(xd->dequant_y1_dc));
+ vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+
+#if 1
+ /*TODO: Remove dequant from BLOCKD. This is a temporary solution until
+ * the quantizer code uses a passed in pointer to the dequant constants.
+ * This will also require modifications to the x86 and neon assembly.
+ * */
+ for (i = 0; i < 16; i++)
+ zd->block[i].dequant = zd->dequant_y1;
+ for (i = 16; i < 24; i++)
+ zd->block[i].dequant = zd->dequant_uv;
+ zd->block[24].dequant = zd->dequant_y2;
+#endif
+ }
+}
+
+void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ MB_ROW_COMP *mbr_ei,
+ int mb_row,
+ int count
+ )
+{
+
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+ int i;
+ (void) mb_row;
+
+ for (i = 0; i < count; i++)
+ {
+ MACROBLOCK *mb = & mbr_ei[i].mb;
+ MACROBLOCKD *mbd = &mb->e_mbd;
+
+ mbd->subpixel_predict = xd->subpixel_predict;
+ mbd->subpixel_predict8x4 = xd->subpixel_predict8x4;
+ mbd->subpixel_predict8x8 = xd->subpixel_predict8x8;
+ mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
+ mb->gf_active_ptr = x->gf_active_ptr;
+
+ vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+ mbr_ei[i].totalrate = 0;
+
+ mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
+
+ mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1);
+ mbd->mode_info_stride = cm->mode_info_stride;
+
+ mbd->frame_type = cm->frame_type;
+
+ mb->src = * cpi->Source;
+ mbd->pre = cm->yv12_fb[cm->lst_fb_idx];
+ mbd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+ mb->src.y_buffer += 16 * x->src.y_stride * (i + 1);
+ mb->src.u_buffer += 8 * x->src.uv_stride * (i + 1);
+ mb->src.v_buffer += 8 * x->src.uv_stride * (i + 1);
+
+ vp8_build_block_offsets(mb);
+
+ mbd->left_context = &cm->left_context;
+ mb->mvc = cm->fc.mvc;
+
+ setup_mbby_copy(&mbr_ei[i].mb, x);
+
+ mbd->fullpixel_mask = 0xffffffff;
+ if(cm->full_pixel)
+ mbd->fullpixel_mask = 0xfffffff8;
+
+ vp8_zero(mb->coef_counts);
+ vp8_zero(x->ymode_count);
+ mb->skip_true_count = 0;
+ vp8_zero(mb->MVcount);
+ mb->prediction_error = 0;
+ mb->intra_error = 0;
+ }
+}
+
+int vp8cx_create_encoder_threads(VP8_COMP *cpi)
+{
+ const VP8_COMMON * cm = &cpi->common;
+
+ cpi->b_multi_threaded = 0;
+ cpi->encoding_thread_count = 0;
+ cpi->b_lpf_running = 0;
+
+ if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+ {
+ int ithread;
+ int th_count = cpi->oxcf.multi_threaded - 1;
+ int rc = 0;
+
+ /* don't allocate more threads than cores available */
+ if (cpi->oxcf.multi_threaded > cm->processor_core_count)
+ th_count = cm->processor_core_count - 1;
+
+ /* we have th_count + 1 (main) threads processing one row each */
+        /* no point in having more threads than the sync range allows */
+ if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
+ {
+ th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
+ }
+
+ if(th_count == 0)
+ return 0;
+
+ CHECK_MEM_ERROR(cpi->h_encoding_thread,
+ vpx_malloc(sizeof(pthread_t) * th_count));
+ CHECK_MEM_ERROR(cpi->h_event_start_encoding,
+ vpx_malloc(sizeof(sem_t) * th_count));
+ CHECK_MEM_ERROR(cpi->mb_row_ei,
+ vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
+ vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+ CHECK_MEM_ERROR(cpi->en_thread_data,
+ vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
+
+ sem_init(&cpi->h_event_end_encoding, 0, 0);
+
+ cpi->b_multi_threaded = 1;
+ cpi->encoding_thread_count = th_count;
+
+ /*
+ printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
+ (cpi->encoding_thread_count +1));
+ */
+
+ for (ithread = 0; ithread < th_count; ithread++)
+ {
+ ENCODETHREAD_DATA *ethd = &cpi->en_thread_data[ithread];
+
+ /* Setup block ptrs and offsets */
+ vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
+ vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
+
+ sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+
+ ethd->ithread = ithread;
+ ethd->ptr1 = (void *)cpi;
+ ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
+
+ rc = pthread_create(&cpi->h_encoding_thread[ithread], 0,
+ thread_encoding_proc, ethd);
+ if(rc)
+ break;
+ }
+
+ if(rc)
+ {
+ /* shutdown other threads */
+ cpi->b_multi_threaded = 0;
+ for(--ithread; ithread >= 0; ithread--)
+ {
+ pthread_join(cpi->h_encoding_thread[ithread], 0);
+ sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ }
+ sem_destroy(&cpi->h_event_end_encoding);
+
+ /* free thread related resources */
+ vpx_free(cpi->h_event_start_encoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+
+ return -1;
+ }
+
+
+ {
+ LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
+
+ sem_init(&cpi->h_event_start_lpf, 0, 0);
+ sem_init(&cpi->h_event_end_lpf, 0, 0);
+
+ lpfthd->ptr1 = (void *)cpi;
+ rc = pthread_create(&cpi->h_filter_thread, 0, thread_loopfilter,
+ lpfthd);
+
+ if(rc)
+ {
+ /* shutdown other threads */
+ cpi->b_multi_threaded = 0;
+ for(--ithread; ithread >= 0; ithread--)
+ {
+ sem_post(&cpi->h_event_start_encoding[ithread]);
+ pthread_join(cpi->h_encoding_thread[ithread], 0);
+ sem_destroy(&cpi->h_event_start_encoding[ithread]);
+ }
+ sem_destroy(&cpi->h_event_end_encoding);
+ sem_destroy(&cpi->h_event_end_lpf);
+ sem_destroy(&cpi->h_event_start_lpf);
+
+ /* free thread related resources */
+ vpx_free(cpi->h_event_start_encoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+
+ return -2;
+ }
+ }
+ }
+ return 0;
+}
+
+void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
+{
+ if (cpi->b_multi_threaded)
+ {
+ /* shutdown other threads */
+ cpi->b_multi_threaded = 0;
+ {
+ int i;
+
+ for (i = 0; i < cpi->encoding_thread_count; i++)
+ {
+ sem_post(&cpi->h_event_start_encoding[i]);
+ pthread_join(cpi->h_encoding_thread[i], 0);
+
+ sem_destroy(&cpi->h_event_start_encoding[i]);
+ }
+
+ sem_post(&cpi->h_event_start_lpf);
+ pthread_join(cpi->h_filter_thread, 0);
+ }
+
+ sem_destroy(&cpi->h_event_end_encoding);
+ sem_destroy(&cpi->h_event_end_lpf);
+ sem_destroy(&cpi->h_event_start_lpf);
+
+ /* free thread related resources */
+ vpx_free(cpi->h_event_start_encoding);
+ vpx_free(cpi->h_encoding_thread);
+ vpx_free(cpi->mb_row_ei);
+ vpx_free(cpi->en_thread_data);
+ }
+}
+#endif
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
new file mode 100644
index 0000000..b668c8f
--- /dev/null
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -0,0 +1,3360 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "block.h"
+#include "onyx_int.h"
+#include "vp8/common/variance.h"
+#include "encodeintra.h"
+#include "vp8/common/setupintrarecon.h"
+#include "vp8/common/systemdependent.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "vp8/common/extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "rdopt.h"
+#include "vp8/common/quant_common.h"
+#include "encodemv.h"
+#include "encodeframe.h"
+
+/* #define OUTPUT_FPF 1 */
+
+extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
+extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
+extern void vp8_alloc_compressor_data(VP8_COMP *cpi);
+
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
+
+extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
+
+#define IIFACTOR 1.5
+#define IIKFACTOR1 1.40
+#define IIKFACTOR2 1.5
+#define RMAX 14.0
+#define GF_RMAX 48.0
+
+#define KF_MB_INTRA_MIN 300
+#define GF_MB_INTRA_MIN 200
+
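+/* Nudge X slightly away from zero so it can safely be used as a divisor;
+ * the offset is far below the precision that matters for these statistics.
+ */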
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+#define NEW_BOOST 1
+
+static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
+static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
+
+
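+/* Remaps a raw Q index onto the constrained quality scale; used by
+ * estimate_cq() once its rate loop has settled on a Q.
+ */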
+static const int cq_level[QINDEX_RANGE] =
+{
+ 0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+ 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+ 20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+ 32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+ 44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+ 57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+ 71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+ 86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
+
+/* Reset the first pass stats read position to the given point in the
+ * in-memory stats buffer
+ */
+static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
+{
+ cpi->twopass.stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ return EOF;
+
+ *next_frame = *cpi->twopass.stats_in;
+ return 1;
+}
+
+/* Read frame stats at an offset from the current position */
+static int read_frame_stats( VP8_COMP *cpi,
+ FIRSTPASS_STATS *frame_stats,
+ int offset )
+{
+ FIRSTPASS_STATS * fps_ptr = cpi->twopass.stats_in;
+
+ /* Check legality of offset */
+ if ( offset >= 0 )
+ {
+ if ( &fps_ptr[offset] >= cpi->twopass.stats_in_end )
+ return EOF;
+ }
+ else if ( offset < 0 )
+ {
+ if ( &fps_ptr[offset] < cpi->twopass.stats_in_start )
+ return EOF;
+ }
+
+ *frame_stats = fps_ptr[offset];
+ return 1;
+}
+
+static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+{
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ return EOF;
+
+ *fps = *cpi->twopass.stats_in;
+ cpi->twopass.stats_in =
+ (void*)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS));
+ return 1;
+}
+
+static void output_stats(const VP8_COMP *cpi,
+ struct vpx_codec_pkt_list *pktlist,
+ FIRSTPASS_STATS *stats)
+{
+ struct vpx_codec_cx_pkt pkt;
+ pkt.kind = VPX_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ vpx_codec_pkt_list_add(pktlist, &pkt);
+
+/* TEMP debug code */
+#if OUTPUT_FPF
+
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+ " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+ " %12.0f %12.0f %12.4f\n",
+ stats->frame,
+ stats->intra_error,
+ stats->coded_error,
+ stats->ssim_weighted_pred_err,
+ stats->pcnt_inter,
+ stats->pcnt_motion,
+ stats->pcnt_second_ref,
+ stats->pcnt_neutral,
+ stats->MVr,
+ stats->mvr_abs,
+ stats->MVc,
+ stats->mvc_abs,
+ stats->MVrv,
+ stats->MVcv,
+ stats->mv_in_out_count,
+ stats->new_mv_count,
+ stats->count,
+ stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section)
+{
+ section->frame = 0.0;
+ section->intra_error = 0.0;
+ section->coded_error = 0.0;
+ section->ssim_weighted_pred_err = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{
+ section->frame += frame->frame;
+ section->intra_error += frame->intra_error;
+ section->coded_error += frame->coded_error;
+ section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{
+ section->frame -= frame->frame;
+ section->intra_error -= frame->intra_error;
+ section->coded_error -= frame->coded_error;
+ section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+static void avg_stats(FIRSTPASS_STATS *section)
+{
+ if (section->count < 1.0)
+ return;
+
+ section->intra_error /= section->count;
+ section->coded_error /= section->count;
+ section->ssim_weighted_pred_err /= section->count;
+ section->pcnt_inter /= section->count;
+ section->pcnt_second_ref /= section->count;
+ section->pcnt_neutral /= section->count;
+ section->pcnt_motion /= section->count;
+ section->MVr /= section->count;
+ section->mvr_abs /= section->count;
+ section->MVc /= section->count;
+ section->mvc_abs /= section->count;
+ section->MVrv /= section->count;
+ section->MVcv /= section->count;
+ section->mv_in_out_count /= section->count;
+ section->duration /= section->count;
+}
+
+/* Calculate a modified Error used in distributing bits between easier
+ * and harder frames
+ */
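+/* For example, with two_pass_vbrbias = 50 the exponent is 0.5, so a frame
+ * with four times the average error is allotted only twice the average
+ * (av_err * sqrt(4)): errors are compressed toward the mean before bits
+ * are distributed.
+ */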
+static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
+ cpi->twopass.total_stats.count );
+ double this_err = this_frame->ssim_weighted_pred_err;
+ double modified_err;
+
+ if (this_err > av_err)
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+ else
+ modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+ return modified_err;
+}
+
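+/* Per-luma-level weights used by simple_weight(): levels below 32 count
+ * for almost nothing (0.02), levels up to 63 ramp linearly, and anything
+ * brighter saturates at 1.0, so very dark regions contribute little to the
+ * weighted prediction error.
+ */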
+static const double weight_table[256] = {
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
+static double simple_weight(YV12_BUFFER_CONFIG *source)
+{
+ int i, j;
+
+ unsigned char *src = source->y_buffer;
+ double sum_weights = 0.0;
+
+    /* Loop through the raw Y plane, examining levels and creating a weight
+     * for the image
+     */
+ i = source->y_height;
+ do
+ {
+ j = source->y_width;
+ do
+ {
+ sum_weights += weight_table[ *src];
+ src++;
+ }while(--j);
+ src -= source->y_width;
+ src += source->y_stride;
+ }while(--i);
+
+ sum_weights /= (source->y_height * source->y_width);
+
+ return sum_weights;
+}
+
+
+/* This function returns the current per frame maximum bitrate target */
+static int frame_max_bits(VP8_COMP *cpi)
+{
+ /* Max allocation for a single frame based on the max section guidelines
+ * passed in and how many bits are left
+ */
+ int max_bits;
+
+ /* For CBR we need to also consider buffer fullness.
+ * If we are running below the optimal level then we need to gradually
+ * tighten up on max_bits.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double buffer_fullness_ratio = (double)cpi->buffer_level / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level);
+
+        /* For CBR base this on the target average bits per frame plus the
+         * maximum section rate passed in by the user
+         */
+ max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+ /* If our buffer is below the optimum level */
+ if (buffer_fullness_ratio < 1.0)
+ {
+ /* The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4. */
+ int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
+
+ max_bits = (int)(max_bits * buffer_fullness_ratio);
+
+ /* Lowest value we will set ... which should allow the buffer to
+ * refill.
+ */
+ if (max_bits < min_max_bits)
+ max_bits = min_max_bits;
+ }
+ }
+ /* VBR */
+ else
+ {
+ /* For VBR base this on the bits and frames left plus the
+ * two_pass_vbrmax_section rate passed in by the user
+ */
+ max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+ }
+
+ /* Trap case where we are out of bits */
+ if (max_bits < 0)
+ max_bits = 0;
+
+ return max_bits;
+}
+
+void vp8_init_first_pass(VP8_COMP *cpi)
+{
+ zero_stats(&cpi->twopass.total_stats);
+}
+
+void vp8_end_first_pass(VP8_COMP *cpi)
+{
+ output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+}
+
+static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
+ YV12_BUFFER_CONFIG * raw_buffer,
+ int * raw_motion_err,
+ YV12_BUFFER_CONFIG * recon_buffer,
+ int * best_motion_err, int recon_yoffset)
+{
+ MACROBLOCKD * const xd = & x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+
+ unsigned char *src_ptr = (*(b->base_src) + b->src);
+ int src_stride = b->src_stride;
+ unsigned char *raw_ptr;
+ int raw_stride = raw_buffer->y_stride;
+ unsigned char *ref_ptr;
+ int ref_stride = x->e_mbd.pre.y_stride;
+
+ /* Set up pointers for this macro block raw buffer */
+ raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
+ + d->offset);
+ vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
+ (unsigned int *)(raw_motion_err));
+
+ /* Set up pointers for this macro block recon buffer */
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+ ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
+ vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
+ (unsigned int *)(best_motion_err));
+}
+
+static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
+ int_mv *ref_mv, MV *best_mv,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int *best_motion_err, int recon_yoffset )
+{
+ MACROBLOCKD *const xd = & x->e_mbd;
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int num00;
+
+ int_mv tmp_mv;
+ int_mv ref_mv_full;
+
+ int tmp_err;
+    int step_param = 3; /* Don't search over the full range in the first pass */
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ int n;
+ vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
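+    /* Fixed penalty approximating the overhead of signalling a new motion
+     * vector, so a marginal search win cannot displace the 0,0 match.
+     */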
+ int new_mv_mode_penalty = 256;
+
+ /* override the default variance function to use MSE */
+ v_fn_ptr.vf = vp8_mse16x16;
+
+ /* Set up pointers for this macro block recon buffer */
+ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+ /* Initial step/diamond search centred on best mv */
+ tmp_mv.as_int = 0;
+ ref_mv_full.as_mv.col = ref_mv->as_mv.col>>3;
+ ref_mv_full.as_mv.row = ref_mv->as_mv.row>>3;
+ tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv, step_param,
+ x->sadperbit16, &num00, &v_fn_ptr,
+ x->mvcost, ref_mv);
+ if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err)
+ {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.as_mv.row;
+ best_mv->col = tmp_mv.as_mv.col;
+ }
+
+ /* Further step/diamond searches as necessary */
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ tmp_err = cpi->diamond_search_sad(x, b, d, &ref_mv_full, &tmp_mv,
+ step_param + n, x->sadperbit16,
+ &num00, &v_fn_ptr, x->mvcost,
+ ref_mv);
+ if ( tmp_err < INT_MAX-new_mv_mode_penalty )
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err)
+ {
+ *best_motion_err = tmp_err;
+ best_mv->row = tmp_mv.as_mv.row;
+ best_mv->col = tmp_mv.as_mv.col;
+ }
+ }
+ }
+}
+
+void vp8_first_pass(VP8_COMP *cpi)
+{
+ int mb_row, mb_col;
+ MACROBLOCK *const x = & cpi->mb;
+ VP8_COMMON *const cm = & cpi->common;
+ MACROBLOCKD *const xd = & x->e_mbd;
+
+ int recon_yoffset, recon_uvoffset;
+ YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+ YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+ int recon_y_stride = lst_yv12->y_stride;
+ int recon_uv_stride = lst_yv12->uv_stride;
+ int64_t intra_error = 0;
+ int64_t coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ int intrapenalty = 256;
+ int neutral_count = 0;
+ int new_mv_count = 0;
+ int sum_in_vectors = 0;
+ uint32_t lastmv_as_int = 0;
+
+ int_mv zero_ref_mv;
+
+ zero_ref_mv.as_int = 0;
+
+ vp8_clear_system_state();
+
+ x->src = * cpi->Source;
+ xd->pre = *lst_yv12;
+ xd->dst = *new_yv12;
+
+ x->partition_info = x->pi;
+
+ xd->mode_info_context = cm->mi;
+
+ if(!cm->use_bilinear_mc_filter)
+ {
+ xd->subpixel_predict = vp8_sixtap_predict4x4;
+ xd->subpixel_predict8x4 = vp8_sixtap_predict8x4;
+ xd->subpixel_predict8x8 = vp8_sixtap_predict8x8;
+ xd->subpixel_predict16x16 = vp8_sixtap_predict16x16;
+ }
+ else
+ {
+ xd->subpixel_predict = vp8_bilinear_predict4x4;
+ xd->subpixel_predict8x4 = vp8_bilinear_predict8x4;
+ xd->subpixel_predict8x8 = vp8_bilinear_predict8x8;
+ xd->subpixel_predict16x16 = vp8_bilinear_predict16x16;
+ }
+
+ vp8_build_block_offsets(x);
+
+    /* set up the new frame for intra coded blocks */
+ vp8_setup_intra_recon(new_yv12);
+ vp8cx_frame_init_quantizer(cpi);
+
+ /* Initialise the MV cost table to the defaults */
+ {
+ int flag[2] = {1, 1};
+ vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+ vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+ }
+
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ int_mv best_ref_mv;
+
+ best_ref_mv.as_int = 0;
+
+ /* reset above block coeffs */
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+ /* Set up limit values for motion vectors to prevent them extending
+ * outside the UMV borders
+ */
+ x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int this_error;
+ int gf_motion_error = INT_MAX;
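+            /* DC-only intra prediction is used on the top row and left
+             * column (excluding the top-left corner), where one of the
+             * prediction edges is unavailable.
+             */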
+ int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+ xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+ xd->dst.u_buffer = new_yv12->u_buffer + recon_uvoffset;
+ xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+
+ /* Copy current mb to a buffer */
+ vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
+ /* do intra 16x16 prediction */
+ this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+
+ /* "intrapenalty" below deals with situations where the intra
+ * and inter error scores are very low (eg a plain black frame)
+ * We do not have special cases in first pass for 0,0 and
+ * nearest etc so all inter modes carry an overhead cost
+ * estimate fot the mv. When the error score is very low this
+ * causes us to pick all or lots of INTRA modes and throw lots
+ * of key frames. This penalty adds a cost matching that of a
+ * 0,0 mv to the intra case.
+ */
+ this_error += intrapenalty;
+
+ /* Cumulative intra error total */
+ intra_error += (int64_t)this_error;
+
+ /* Set up limit values for motion vectors to prevent them
+ * extending outside the UMV borders
+ */
+ x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+ x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+
+ /* Other than for the first frame do a motion search */
+ if (cm->current_video_frame > 0)
+ {
+ BLOCKD *d = &x->e_mbd.block[0];
+ MV tmp_mv = {0, 0};
+ int tmp_err;
+ int motion_error = INT_MAX;
+ int raw_motion_error = INT_MAX;
+
+ /* Simple 0,0 motion with no mv overhead */
+ zz_motion_search( cpi, x, cpi->last_frame_unscaled_source,
+ &raw_motion_error, lst_yv12, &motion_error,
+ recon_yoffset );
+ d->bmi.mv.as_mv.row = 0;
+ d->bmi.mv.as_mv.col = 0;
+
+ if (raw_motion_error < cpi->oxcf.encode_breakout)
+ goto skip_motion_search;
+
+ /* Test last reference frame using the previous best mv as the
+ * starting point (best reference) for the search
+ */
+ first_pass_motion_search(cpi, x, &best_ref_mv,
+ &d->bmi.mv.as_mv, lst_yv12,
+ &motion_error, recon_yoffset);
+
+ /* If the current best reference mv is not centred on 0,0
+ * then do a 0,0 based search as well
+ */
+ if (best_ref_mv.as_int)
+ {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
+ lst_yv12, &tmp_err, recon_yoffset);
+
+ if ( tmp_err < motion_error )
+ {
+ motion_error = tmp_err;
+ d->bmi.mv.as_mv.row = tmp_mv.row;
+ d->bmi.mv.as_mv.col = tmp_mv.col;
+ }
+ }
+
+ /* Experimental search in a second reference frame ((0,0)
+ * based only)
+ */
+ if (cm->current_video_frame > 1)
+ {
+ first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
+
+ if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
+ {
+ second_ref_count++;
+ }
+
+ /* Reset to last frame as reference buffer */
+ xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
+ xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
+ xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
+ }
+
+skip_motion_search:
+ /* Intra assumed best */
+ best_ref_mv.as_int = 0;
+
+ if (motion_error <= this_error)
+ {
+ /* Keep a count of cases where the inter and intra were
+ * very close and very low. This helps with scene cut
+ * detection for example in cropped clips with black bars
+ * at the sides or top and bottom.
+ */
+ if( (((this_error-intrapenalty) * 9) <=
+ (motion_error*10)) &&
+ (this_error < (2*intrapenalty)) )
+ {
+ neutral_count++;
+ }
+
+ d->bmi.mv.as_mv.row <<= 3;
+ d->bmi.mv.as_mv.col <<= 3;
+ this_error = motion_error;
+ vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv);
+ vp8_encode_inter16x16y(x);
+ sum_mvr += d->bmi.mv.as_mv.row;
+ sum_mvr_abs += abs(d->bmi.mv.as_mv.row);
+ sum_mvc += d->bmi.mv.as_mv.col;
+ sum_mvc_abs += abs(d->bmi.mv.as_mv.col);
+ sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row;
+ sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
+ intercount++;
+
+ best_ref_mv.as_int = d->bmi.mv.as_int;
+
+ /* Was the vector non-zero */
+ if (d->bmi.mv.as_int)
+ {
+ mvcount++;
+
+                    /* Was it different from the last non-zero vector */
+ if ( d->bmi.mv.as_int != lastmv_as_int )
+ new_mv_count++;
+ lastmv_as_int = d->bmi.mv.as_int;
+
+ /* Does the Row vector point inwards or outwards */
+ if (mb_row < cm->mb_rows / 2)
+ {
+ if (d->bmi.mv.as_mv.row > 0)
+ sum_in_vectors--;
+ else if (d->bmi.mv.as_mv.row < 0)
+ sum_in_vectors++;
+ }
+ else if (mb_row > cm->mb_rows / 2)
+ {
+ if (d->bmi.mv.as_mv.row > 0)
+ sum_in_vectors++;
+ else if (d->bmi.mv.as_mv.row < 0)
+ sum_in_vectors--;
+ }
+
+                    /* Does the Column vector point inwards or outwards */
+ if (mb_col < cm->mb_cols / 2)
+ {
+ if (d->bmi.mv.as_mv.col > 0)
+ sum_in_vectors--;
+ else if (d->bmi.mv.as_mv.col < 0)
+ sum_in_vectors++;
+ }
+ else if (mb_col > cm->mb_cols / 2)
+ {
+ if (d->bmi.mv.as_mv.col > 0)
+ sum_in_vectors++;
+ else if (d->bmi.mv.as_mv.col < 0)
+ sum_in_vectors--;
+ }
+ }
+ }
+ }
+
+ coded_error += (int64_t)this_error;
+
+ /* adjust to the next column of macroblocks */
+ x->src.y_buffer += 16;
+ x->src.u_buffer += 8;
+ x->src.v_buffer += 8;
+
+ recon_yoffset += 16;
+ recon_uvoffset += 8;
+ }
+
+ /* adjust to the next row of mbs */
+ x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+ x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+ x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+ /* extend the recon for intra prediction */
+ vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+ vp8_clear_system_state();
+ }
+
+ vp8_clear_system_state();
+ {
+ double weight = 0.0;
+
+ FIRSTPASS_STATS fps;
+
+        fps.frame = cm->current_video_frame;
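+        /* The 64-bit error totals are scaled down by 256 before being
+         * stored as per-frame doubles.
+         */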
+ fps.intra_error = (double)(intra_error >> 8);
+ fps.coded_error = (double)(coded_error >> 8);
+ weight = simple_weight(cpi->Source);
+
+
+ if (weight < 0.1)
+ weight = 0.1;
+
+ fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+ fps.pcnt_inter = 0.0;
+ fps.pcnt_motion = 0.0;
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.count = 1.0;
+
+ fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
+ fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+ fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
+
+ if (mvcount > 0)
+ {
+ fps.MVr = (double)sum_mvr / (double)mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+ fps.MVc = (double)sum_mvc / (double)mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+ fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+ fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+ fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+ fps.new_mv_count = new_mv_count;
+
+ fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+ }
+
+ /* TODO: handle the case when duration is set to 0, or something less
+ * than the full time between subsequent cpi->source_time_stamps
+ */
+ fps.duration = (double)(cpi->source->ts_end
+ - cpi->source->ts_start);
+
+ /* don't want to do output stats with a stack variable! */
+ memcpy(&cpi->twopass.this_frame_stats,
+ &fps,
+ sizeof(FIRSTPASS_STATS));
+ output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+ accumulate_stats(&cpi->twopass.total_stats, &fps);
+ }
+
+ /* Copy the previous Last Frame into the GF buffer if specific
+ * conditions for doing so are met
+ */
+ if ((cm->current_video_frame > 0) &&
+ (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+ ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
+ {
+ vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ }
+
+ /* swap frame pointers so last frame refers to the frame we just
+ * compressed
+ */
+ vp8_swap_yv12_buffer(lst_yv12, new_yv12);
+ vp8_yv12_extend_frame_borders(lst_yv12);
+
+ /* Special case for the first frame. Copy into the GF buffer as a
+ * second reference.
+ */
+ if (cm->current_video_frame == 0)
+ {
+ vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ }
+
+
+ /* use this to see what the first pass reconstruction looks like */
+ if (0)
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ (void) fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1,
+ recon_file);
+ fclose(recon_file);
+ }
+
+ cm->current_video_frame++;
+
+}
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+/* Estimate a cost per mb attributable to overheads such as the coding of
+ * modes and motion vectors.
+ * Currently simplistic in its assumptions for testing.
+ */
+
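+/* Bits needed to code an event of the given probability: -log2(prob). */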
+static double bitcost( double prob )
+{
+ return -(log( prob ) / log( 2.0 ));
+}
+static int64_t estimate_modemvcost(VP8_COMP *cpi,
+ FIRSTPASS_STATS * fpstats)
+{
+ int mv_cost;
+ int mode_cost;
+
+ double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
+ double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
+ double av_intra = (1.0 - av_pct_inter);
+
+ double zz_cost;
+ double motion_cost;
+ double intra_cost;
+
+ zz_cost = bitcost(av_pct_inter - av_pct_motion);
+ motion_cost = bitcost(av_pct_motion);
+ intra_cost = bitcost(av_intra);
+
+ /* Estimate of extra bits per mv overhead for mbs
+ * << 9 is the normalization to the (bits * 512) used in vp8_bits_per_mb
+ */
+ mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
+
+ /* Crude estimate of overhead cost from modes
+ * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb
+ */
+ mode_cost =
+ (int)( ( ((av_pct_inter - av_pct_motion) * zz_cost) +
+ (av_pct_motion * motion_cost) +
+ (av_intra * intra_cost) ) * cpi->common.MBs ) << 9;
+
+ return mv_cost + mode_cost;
+}
+
+static double calc_correction_factor( double err_per_mb,
+                                      double err_divisor,
+ double pt_low,
+ double pt_high,
+ int Q )
+{
+ double power_term;
+    double error_term = err_per_mb / err_divisor;
+ double correction_factor;
+
+ /* Adjustment based on Q to power term. */
+ power_term = pt_low + (Q * 0.01);
+ power_term = (power_term > pt_high) ? pt_high : power_term;
+
+ /* Adjustments to error term */
+ /* TBD */
+
+ /* Calculate correction factor */
+ correction_factor = pow(error_term, power_term);
+
+ /* Clip range */
+ correction_factor =
+ (correction_factor < 0.05)
+ ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
+
+ return correction_factor;
+}
+
+static int estimate_max_q(VP8_COMP *cpi,
+ FIRSTPASS_STATS * fpstats,
+ int section_target_bandwitdh,
+ int overhead_bits )
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double section_err = (fpstats->coded_error / fpstats->count);
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ int overhead_bits_per_mb;
+
+ if (section_target_bandwitdh <= 0)
+ return cpi->twopass.maxq_max_limit; /* Highest value allowed */
+
+ target_norm_bits_per_mb =
+ (section_target_bandwitdh < (1 << 20))
+ ? (512 * section_target_bandwitdh) / num_mbs
+ : 512 * (section_target_bandwitdh / num_mbs);
+
+ /* Calculate a corrective factor based on a rolling ratio of bits spent
+ * vs target bits
+ */
+ if ((cpi->rolling_target_bits > 0) &&
+ (cpi->active_worst_quality < cpi->worst_quality))
+ {
+ double rolling_ratio;
+
+ rolling_ratio = (double)cpi->rolling_actual_bits /
+ (double)cpi->rolling_target_bits;
+
+ if (rolling_ratio < 0.95)
+ cpi->twopass.est_max_qcorrection_factor -= 0.005;
+ else if (rolling_ratio > 1.05)
+ cpi->twopass.est_max_qcorrection_factor += 0.005;
+
+ cpi->twopass.est_max_qcorrection_factor =
+ (cpi->twopass.est_max_qcorrection_factor < 0.1)
+ ? 0.1
+ : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+ ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+ }
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* Estimate of overhead bits per mb */
+ /* Correction to overhead bits for min allowed Q. */
+ overhead_bits_per_mb = overhead_bits / num_mbs;
+ overhead_bits_per_mb = (int)(overhead_bits_per_mb *
+ pow( 0.98, (double)cpi->twopass.maxq_min_limit ));
+
+ /* Try and pick a max Q that will be high enough to encode the
+ * content at the given rate.
+ */
+ for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
+
+ bits_per_mb_at_this_q =
+ vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
+
+ bits_per_mb_at_this_q = (int)(.5 + err_correction_factor
+ * speed_correction * cpi->twopass.est_max_qcorrection_factor
+ * cpi->twopass.section_max_qfactor
+ * (double)bits_per_mb_at_this_q);
+
+ /* Mode and motion overhead */
+ /* As Q rises in real encode loop rd code will force overhead down
+ * We make a crude adjustment for this here as *.98 per Q step.
+ */
+ overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ /* Restriction on active max q for constrained quality mode. */
+ if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < cpi->cq_target_quality) )
+ {
+ Q = cpi->cq_target_quality;
+ }
+
+    /* Adjust maxq_min_limit and maxq_max_limit based on the average q
+     * observed in the clip for non kf/gf/arf frames.
+     * Give the average a chance to settle though.
+     */
+ if ( (cpi->ni_frames >
+ ((int)cpi->twopass.total_stats.count >> 8)) &&
+ (cpi->ni_frames > 150) )
+ {
+ cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
+ ? (cpi->ni_av_qi + 32) : cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
+ ? (cpi->ni_av_qi - 32) : cpi->best_quality;
+ }
+
+ return Q;
+}
+
+/* For cq mode estimate a cq level that matches the observed
+ * complexity and data rate.
+ */
+static int estimate_cq( VP8_COMP *cpi,
+ FIRSTPASS_STATS * fpstats,
+ int section_target_bandwidth,
+ int overhead_bits )
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double section_err = (fpstats->coded_error / fpstats->count);
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ double clip_iiratio;
+ double clip_iifactor;
+ int overhead_bits_per_mb;
+
+ if (0)
+ {
+ FILE *f = fopen("epmp.stt", "a");
+ fprintf(f, "%10.2f\n", err_per_mb );
+ fclose(f);
+ }
+
+ target_norm_bits_per_mb = (section_target_bandwidth < (1 << 20))
+ ? (512 * section_target_bandwidth) / num_mbs
+ : 512 * (section_target_bandwidth / num_mbs);
+
+ /* Estimate of overhead bits per mb */
+ overhead_bits_per_mb = overhead_bits / num_mbs;
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* II ratio correction factor for clip as a whole */
+ clip_iiratio = cpi->twopass.total_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
+ clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+ if (clip_iifactor < 0.80)
+ clip_iifactor = 0.80;
+
+ /* Try and pick a Q that can encode the content at the given rate. */
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 100.0, 0.40, 0.90, Q);
+
+ bits_per_mb_at_this_q =
+ vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb;
+
+ bits_per_mb_at_this_q =
+ (int)( .5 + err_correction_factor *
+ speed_correction *
+ clip_iifactor *
+ (double)bits_per_mb_at_this_q);
+
+ /* Mode and motion overhead */
+ /* As Q rises in the real encode loop the rd code will force overhead
+ * down. We make a crude adjustment for this here as *0.98 per Q step.
+ */
+ overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ /* Clip value to range "best allowed to (worst allowed - 1)" */
+ Q = cq_level[Q];
+ if ( Q >= cpi->worst_quality )
+ Q = cpi->worst_quality - 1;
+ if ( Q < cpi->best_quality )
+ Q = cpi->best_quality;
+
+ return Q;
+}
+
+static int estimate_q(VP8_COMP *cpi, double section_err,
+ int section_target_bandwidth)
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb;
+
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+
+ target_norm_bits_per_mb = (section_target_bandwidth < (1 << 20))
+ ? (512 * section_target_bandwidth) / num_mbs
+ : 512 * (section_target_bandwidth / num_mbs);
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* Try and pick a Q that can encode the content at the given rate. */
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ int bits_per_mb_at_this_q;
+
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
+
+ bits_per_mb_at_this_q =
+ (int)( .5 + ( err_correction_factor *
+ speed_correction *
+ cpi->twopass.est_max_qcorrection_factor *
+ (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0 ) );
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ return Q;
+}
+
+/* Estimate a worst case Q for a KF group */
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
+ int section_target_bandwidth,
+ double group_iiratio)
+{
+ int Q;
+ int num_mbs = cpi->common.MBs;
+ int target_norm_bits_per_mb = (512 * section_target_bandwidth) / num_mbs;
+ int bits_per_mb_at_this_q;
+
+ double err_per_mb = section_err / num_mbs;
+ double err_correction_factor;
+ double speed_correction = 1.0;
+ double current_spend_ratio = 1.0;
+
+ double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90;
+ double pow_lowq = (POW1 < 0.7) ? POW1 + 0.1 : 0.80;
+
+ double iiratio_correction_factor = 1.0;
+
+ double combined_correction_factor;
+
+ /* Trap special case where the target is <= 0 */
+ if (target_norm_bits_per_mb <= 0)
+ return MAXQ * 2;
+
+ /* Calculate a corrective factor based on a rolling ratio of bits spent
+ * vs target bits
+ * This is clamped to the range 0.1 to 10.0
+ */
+ if (cpi->long_rolling_target_bits <= 0)
+ current_spend_ratio = 10.0;
+ else
+ {
+ current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits;
+ current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+ }
+
+ /* Calculate a correction factor based on the quality of prediction in
+ * the sequence as indicated by intra_inter error score ratio (IIRatio)
+ * The idea here is to favour subsampling in the hardest sections vs
+ * the easiest.
+ */
+ iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1);
+
+ if (iiratio_correction_factor < 0.5)
+ iiratio_correction_factor = 0.5;
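+ /* Illustrative: group_iiratio = 8.0 gives a factor of 0.8, trimming
+ * the bit estimate for well predicted (easy) sections, with the 0.5
+ * floor capping the reduction for extremely easy material.
+ */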
+
+ /* Corrections for higher compression speed settings
+ * (reduced compression expected)
+ */
+ if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+ {
+ if (cpi->oxcf.cpu_used <= 5)
+ speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ else
+ speed_correction = 1.25;
+ }
+
+ /* Combine the various factors calculated above */
+ combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio;
+
+ /* Try and pick a Q that should be high enough to encode the content at
+ * the given rate.
+ */
+ for (Q = 0; Q < MAXQ; Q++)
+ {
+ /* Error per MB based correction factor */
+ err_correction_factor =
+ calc_correction_factor(err_per_mb, 150.0, pow_lowq, pow_highq, Q);
+
+ bits_per_mb_at_this_q =
+ (int)(.5 + ( err_correction_factor *
+ combined_correction_factor *
+ (double)vp8_bits_per_mb[INTER_FRAME][Q]) );
+
+ if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+ break;
+ }
+
+ /* If we could not hit the target even at Max Q then estimate what Q
+ * would have been required
+ */
+ while ((bits_per_mb_at_this_q > target_norm_bits_per_mb) && (Q < (MAXQ * 2)))
+ {
+
+ bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q);
+ Q++;
+ }
+
+ if (0)
+ {
+ FILE *f = fopen("estkf_q.stt", "a");
+ fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q,
+ target_norm_bits_per_mb, err_per_mb, err_correction_factor,
+ current_spend_ratio, group_iiratio, iiratio_correction_factor,
+ (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, Q);
+ fclose(f);
+ }
+
+ return Q;
+}
+
+extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
+
+void vp8_init_second_pass(VP8_COMP *cpi)
+{
+ FIRSTPASS_STATS this_frame;
+ FIRSTPASS_STATS *start_pos;
+
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ zero_stats(&cpi->twopass.total_stats);
+ zero_stats(&cpi->twopass.total_left_stats);
+
+ if (!cpi->twopass.stats_in_end)
+ return;
+
+ cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+ cpi->twopass.total_left_stats = cpi->twopass.total_stats;
+
+ /* Each frame can have a different duration, as the frame rate in the
+ * source isn't guaranteed to be constant. The frame rate prior to
+ * the first frame encoded in the second pass is a guess. However the
+ * sum duration is not. It is calculated based on the actual durations
+ * of all frames from the first pass.
+ */
+ vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
+
+ cpi->output_frame_rate = cpi->frame_rate;
+ cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
+ cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
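+ /* The 10000000.0 scale used here implies stat durations in units of
+ * 1/10,000,000 second. As an illustrative example, 300 frames with a
+ * summed duration of 100,000,000 units give a frame rate of
+ * 10000000.0 * 300 / 100000000 = 30 fps.
+ */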
+
+ /* Calculate a minimum intra value to be used in determining the IIratio
+ * scores used in the second pass. We have this minimum to make sure
+ * that clips that are static but "low complexity" in the intra domain
+ * are still boosted appropriately for KF/GF/ARF
+ */
+ cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
+ cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
+
+ /* Scan the first pass file and calculate an average Intra / Inter error
+ * score ratio for the sequence
+ */
+ {
+ double sum_iiratio = 0.0;
+ double IIRatio;
+
+ start_pos = cpi->twopass.stats_in; /* Note starting "file" position */
+
+ while (input_stats(cpi, &this_frame) != EOF)
+ {
+ IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+ IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+ sum_iiratio += IIRatio;
+ }
+
+ cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
+
+ /* Reset file position */
+ reset_fpf_position(cpi, start_pos);
+ }
+
+ /* Scan the first pass file and calculate a modified total error based
+ * upon the bias/power function used to allocate bits
+ */
+ {
+ start_pos = cpi->twopass.stats_in; /* Note starting "file" position */
+
+ cpi->twopass.modified_error_total = 0.0;
+ cpi->twopass.modified_error_used = 0.0;
+
+ while (input_stats(cpi, &this_frame) != EOF)
+ {
+ cpi->twopass.modified_error_total += calculate_modified_err(cpi, &this_frame);
+ }
+ cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
+
+ reset_fpf_position(cpi, start_pos); /* Reset file position */
+
+ }
+}
+
+void vp8_end_second_pass(VP8_COMP *cpi)
+{
+}
+
+/* This function gives an estimate of how badly we believe the prediction
+ * quality is decaying from frame to frame.
+ */
+static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+ double prediction_decay_rate;
+ double motion_decay;
+ double motion_pct = next_frame->pcnt_motion;
+
+ /* Initial basis is the % mbs inter coded */
+ prediction_decay_rate = next_frame->pcnt_inter;
+
+ /* High % motion -> somewhat higher decay rate */
+ motion_decay = (1.0 - (motion_pct / 20.0));
+ if (motion_decay < prediction_decay_rate)
+ prediction_decay_rate = motion_decay;
+
+ /* Adjustment to decay rate based on speed of motion */
+ {
+ double this_mv_rabs;
+ double this_mv_cabs;
+ double distance_factor;
+
+ this_mv_rabs = fabs(next_frame->mvr_abs * motion_pct);
+ this_mv_cabs = fabs(next_frame->mvc_abs * motion_pct);
+
+ distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
+ (this_mv_cabs * this_mv_cabs)) / 250.0;
+ distance_factor = ((distance_factor > 1.0)
+ ? 0.0 : (1.0 - distance_factor));
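+ /* Illustrative: a combined motion magnitude of ~125 units gives
+ * distance_factor = 0.5, while anything beyond 250 zeroes the
+ * factor, treating very fast motion as a full prediction break.
+ */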
+ if (distance_factor < prediction_decay_rate)
+ prediction_decay_rate = distance_factor;
+ }
+
+ return prediction_decay_rate;
+}
+
+/* Function to test for a condition where a complex transition is followed
+ * by a static section. For example in slide shows where there is a fade
+ * between slides. This is to help with more optimal kf and gf positioning.
+ */
+static int detect_transition_to_still(
+ VP8_COMP *cpi,
+ int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double decay_accumulator )
+{
+ int trans_to_still = 0;
+
+ /* Break clause to detect very still sections after motion
+ * For example a static image after a fade or other transition
+ * instead of a clean scene cut.
+ */
+ if ( (frame_interval > MIN_GF_INTERVAL) &&
+ (loop_decay_rate >= 0.999) &&
+ (decay_accumulator < 0.9) )
+ {
+ int j;
+ FIRSTPASS_STATS * position = cpi->twopass.stats_in;
+ FIRSTPASS_STATS tmp_next_frame;
+ double decay_rate;
+
+ /* Look ahead a few frames to see if static condition persists... */
+ for ( j = 0; j < still_interval; j++ )
+ {
+ if (EOF == input_stats(cpi, &tmp_next_frame))
+ break;
+
+ decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+ if ( decay_rate < 0.999 )
+ break;
+ }
+ /* Reset file position */
+ reset_fpf_position(cpi, position);
+
+ /* Only if the still condition persists do we signal a transition */
+ if ( j == still_interval )
+ trans_to_still = 1;
+ }
+
+ return trans_to_still;
+}
+
+/* This function detects a flash through the high relative pcnt_second_ref
+ * score in the frame following a flash frame. The offset passed in should
+ * reflect this.
+ */
+static int detect_flash( VP8_COMP *cpi, int offset )
+{
+ FIRSTPASS_STATS next_frame;
+
+ int flash_detected = 0;
+
+ /* Read the frame data. */
+ /* The return is 0 (no flash detected) if not a valid frame */
+ if ( read_frame_stats(cpi, &next_frame, offset) != EOF )
+ {
+ /* What we are looking for here is a situation where there is a
+ * brief break in prediction (such as a flash) but subsequent frames
+ * are reasonably well predicted by an earlier (pre flash) frame.
+ * The recovery after a flash is indicated by a high pcnt_second_ref
+ * compared to pcnt_inter.
+ */
+ if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
+ (next_frame.pcnt_second_ref >= 0.5 ) )
+ {
+ flash_detected = 1;
+
+ /*if (1)
+ {
+ FILE *f = fopen("flash.stt", "a");
+ fprintf(f, "%8.0f %6.2f %6.2f\n",
+ next_frame.frame,
+ next_frame.pcnt_inter,
+ next_frame.pcnt_second_ref);
+ fclose(f);
+ }*/
+ }
+ }
+
+ return flash_detected;
+}
+
+/* Update the motion related elements to the GF arf boost calculation */
+static void accumulate_frame_motion_stats(
+ VP8_COMP *cpi,
+ FIRSTPASS_STATS * this_frame,
+ double * this_frame_mv_in_out,
+ double * mv_in_out_accumulator,
+ double * abs_mv_in_out_accumulator,
+ double * mv_ratio_accumulator )
+{
+ double this_frame_mvr_ratio;
+ double this_frame_mvc_ratio;
+ double motion_pct;
+
+ /* Accumulate motion stats. */
+ motion_pct = this_frame->pcnt_motion;
+
+ /* Accumulate Motion In/Out of frame stats */
+ *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
+ *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
+ *abs_mv_in_out_accumulator +=
+ fabs(this_frame->mv_in_out_count * motion_pct);
+
+ /* Accumulate a measure of how uniform (or conversely how random)
+ * the motion field is. (A ratio of absmv / mv)
+ */
+ if (motion_pct > 0.05)
+ {
+ this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr));
+
+ this_frame_mvc_ratio = fabs(this_frame->mvc_abs) /
+ DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc));
+
+ *mv_ratio_accumulator +=
+ (this_frame_mvr_ratio < this_frame->mvr_abs)
+ ? (this_frame_mvr_ratio * motion_pct)
+ : this_frame->mvr_abs * motion_pct;
+
+ *mv_ratio_accumulator +=
+ (this_frame_mvc_ratio < this_frame->mvc_abs)
+ ? (this_frame_mvc_ratio * motion_pct)
+ : this_frame->mvc_abs * motion_pct;
+
+ }
+}
+
+/* Calculate a baseline boost number for the current frame. */
+static double calc_frame_boost(
+ VP8_COMP *cpi,
+ FIRSTPASS_STATS * this_frame,
+ double this_frame_mv_in_out )
+{
+ double frame_boost;
+
+ /* Underlying boost factor is based on inter intra error ratio */
+ if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
+ frame_boost = (IIFACTOR * this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+ else
+ frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
+
+ /* Increase boost for frames where new data is coming into the frame
+ * (eg zoom out). Slightly reduce boost if there is a net balance
+ * of motion out of the frame (zoom in).
+ * The range for this_frame_mv_in_out is -1.0 to +1.0
+ */
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ /* In extreme case boost is halved */
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ /* Clip to maximum */
+ if (frame_boost > GF_RMAX)
+ frame_boost = GF_RMAX;
+
+ return frame_boost;
+}
+
+#if NEW_BOOST
+static int calc_arf_boost(
+ VP8_COMP *cpi,
+ int offset,
+ int f_frames,
+ int b_frames,
+ int *f_boost,
+ int *b_boost )
+{
+ FIRSTPASS_STATS this_frame;
+
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double r;
+ int flash_detected = 0;
+
+ /* Search forward from the proposed arf/next gf position */
+ for ( i = 0; i < f_frames; i++ )
+ {
+ if ( read_frame_stats(cpi, &this_frame, (i+offset)) == EOF )
+ break;
+
+ /* Update the motion related elements to the boost calculation */
+ accumulate_frame_motion_stats( cpi, &this_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+ /* Calculate the baseline boost number for this frame */
+ r = calc_frame_boost( cpi, &this_frame, this_frame_mv_in_out );
+
+ /* We want to discount the flash frame itself and the recovery
+ * frame that follows as both will have poor scores.
+ */
+ flash_detected = detect_flash(cpi, (i+offset)) ||
+ detect_flash(cpi, (i+offset+1));
+
+ /* Cumulative effect of prediction quality decay */
+ if ( !flash_detected )
+ {
+ decay_accumulator =
+ decay_accumulator *
+ get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+ boost_score += (decay_accumulator * r);
+
+ /* Break out conditions. */
+ if ( (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ) )
+ {
+ break;
+ }
+ }
+
+ *f_boost = (int)(boost_score * 100.0) >> 4;
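+ /* The accumulated boost is scaled by 100 then divided by 16 (>> 4);
+ * e.g. (illustrative) a boost_score of 3.2 maps to an f_boost of 20.
+ */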
+
+ /* Reset for backward looking loop */
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ /* Search backward from the proposed arf position towards the last gf */
+ for ( i = -1; i >= -b_frames; i-- )
+ {
+ if ( read_frame_stats(cpi, &this_frame, (i+offset)) == EOF )
+ break;
+
+ /* Update the motion related elements to the boost calculation */
+ accumulate_frame_motion_stats( cpi, &this_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+ /* Calculate the baseline boost number for this frame */
+ r = calc_frame_boost( cpi, &this_frame, this_frame_mv_in_out );
+
+ /* We want to discount the flash frame itself and the recovery
+ * frame that follows as both will have poor scores.
+ */
+ flash_detected = detect_flash(cpi, (i+offset)) ||
+ detect_flash(cpi, (i+offset+1));
+
+ /* Cumulative effect of prediction quality decay */
+ if ( !flash_detected )
+ {
+ decay_accumulator =
+ decay_accumulator *
+ get_prediction_decay_rate(cpi, &this_frame);
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+
+ boost_score += (decay_accumulator * r);
+
+ /* Break out conditions. */
+ if ( (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ) )
+ {
+ break;
+ }
+ }
+ *b_boost = (int)(boost_score * 100.0) >> 4;
+
+ return (*f_boost + *b_boost);
+}
+#endif
+
+/* Analyse and define a gf/arf group. */
+static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_pos;
+ int i;
+ double r;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double gf_group_err = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00; /* Starting decay rate */
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double mod_err_per_mb_accumulator = 0.0;
+
+ int max_bits = frame_max_bits(cpi); /* Max for a single frame */
+
+ unsigned int allow_alt_ref =
+ cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
+
+ int alt_boost = 0;
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+
+ cpi->twopass.gf_group_bits = 0;
+ cpi->twopass.gf_decay_rate = 0;
+
+ vp8_clear_system_state();
+
+ start_pos = cpi->twopass.stats_in;
+
+ vpx_memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
+
+ /* Load stats for the current frame. */
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+ /* Note the error of the frame at the start of the group (this will be
+ * the GF frame error if we code a normal gf).
+ */
+ gf_first_frame_err = mod_frame_err;
+
+ /* Special treatment if the current frame is a key frame (which is also
+ * a gf). If it is then its error score (and hence bit allocation) need
+ * to be subtracted out from the calculation for the GF group
+ */
+ if (cpi->common.frame_type == KEY_FRAME)
+ gf_group_err -= gf_first_frame_err;
+
+ /* Scan forward to try and work out how many frames the next gf group
+ * should contain and what level of boost is appropriate for the GF
+ * or ARF that will be coded with the group
+ */
+ i = 0;
+
+ while (((i < cpi->twopass.static_scene_max_gf_interval) ||
+ ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
+ (i < cpi->twopass.frames_to_key))
+ {
+ i++;
+
+ /* Accumulate error score of frames in this gf group */
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+ gf_group_err += mod_frame_err;
+
+ mod_err_per_mb_accumulator +=
+ mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+
+ if (EOF == input_stats(cpi, &next_frame))
+ break;
+
+ /* Test for the case where there is a brief flash but the prediction
+ * quality back to an earlier frame is then restored.
+ */
+ flash_detected = detect_flash(cpi, 0);
+
+ /* Update the motion related elements to the boost calculation */
+ accumulate_frame_motion_stats( cpi, &next_frame,
+ &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
+
+ /* Calculate a baseline boost number for this frame */
+ r = calc_frame_boost( cpi, &next_frame, this_frame_mv_in_out );
+
+ /* Cumulative effect of prediction quality decay */
+ if ( !flash_detected )
+ {
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator =
+ decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+ }
+ boost_score += (decay_accumulator * r);
+
+ /* Break clause to detect very still sections after motion
+ * For example a static image after a fade or other transition.
+ */
+ if ( detect_transition_to_still( cpi, i, 5,
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ allow_alt_ref = 0;
+ boost_score = old_boost_score;
+ break;
+ }
+
+ /* Break out conditions. */
+ if (
+ /* Break at cpi->max_gf_interval unless almost totally static */
+ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
+ (
+ /* Don't break out with a very short interval */
+ (i > MIN_GF_INTERVAL) &&
+ /* Don't break out very close to a key frame */
+ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
+ ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > 100.0) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < 2.0))
+ ) )
+ {
+ boost_score = old_boost_score;
+ break;
+ }
+
+ vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+ old_boost_score = boost_score;
+ }
+
+ cpi->twopass.gf_decay_rate =
+ (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
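+ /* Illustrative: a decay_accumulator of 0.8 over a 10 frame group
+ * gives gf_decay_rate = (100 * 0.2) / 10 = 2, i.e. roughly 2% loss
+ * of prediction quality per frame.
+ */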
+
+ /* When using CBR apply additional buffer related upper limits */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double max_boost;
+
+ /* For cbr apply buffer related limits */
+ if (cpi->drop_frames_allowed)
+ {
+ int64_t df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+ (cpi->oxcf.optimal_buffer_level / 100);
+
+ if (cpi->buffer_level > df_buffer_level)
+ max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ else
+ max_boost = 0.0;
+ }
+ else if (cpi->buffer_level > 0)
+ {
+ max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ }
+ else
+ {
+ max_boost = 0.0;
+ }
+
+ if (boost_score > max_boost)
+ boost_score = max_boost;
+ }
+
+ /* Don't allow conventional gf too near the next kf */
+ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)
+ {
+ while (i < cpi->twopass.frames_to_key)
+ {
+ i++;
+
+ if (EOF == input_stats(cpi, this_frame))
+ break;
+
+ if (i < cpi->twopass.frames_to_key)
+ {
+ mod_frame_err = calculate_modified_err(cpi, this_frame);
+ gf_group_err += mod_frame_err;
+ }
+ }
+ }
+
+ cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
+
+#if NEW_BOOST
+ /* Alternative boost calculation for alt ref */
+ alt_boost = calc_arf_boost( cpi, 0, (i-1), (i-1), &f_boost, &b_boost );
+#endif
+
+ /* Should we use the alternate reference frame? */
+ if (allow_alt_ref &&
+ (i >= MIN_GF_INTERVAL) &&
+ /* don't use ARF very near next kf */
+ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
+#if NEW_BOOST
+ ((next_frame.pcnt_inter > 0.75) ||
+ (next_frame.pcnt_second_ref > 0.5)) &&
+ ((mv_in_out_accumulator / (double)i > -0.2) ||
+ (mv_in_out_accumulator > -2.0)) &&
+ (b_boost > 100) &&
+ (f_boost > 100) )
+#else
+ (next_frame.pcnt_inter > 0.75) &&
+ ((mv_in_out_accumulator / (double)i > -0.2) ||
+ (mv_in_out_accumulator > -2.0)) &&
+ (cpi->gfu_boost > 100) &&
+ (cpi->twopass.gf_decay_rate <=
+ (ARF_DECAY_THRESH + (cpi->gfu_boost / 200))) )
+#endif
+ {
+ int Boost;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0)
+ ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int tmp_q;
+ int arf_frame_bits = 0;
+ int group_bits;
+
+#if NEW_BOOST
+ cpi->gfu_boost = alt_boost;
+#endif
+
+ /* Estimate the bits to be allocated to the group as a whole */
+ if ((cpi->twopass.kf_group_bits > 0) &&
+ (cpi->twopass.kf_group_error_left > 0))
+ {
+ group_bits = (int)((double)cpi->twopass.kf_group_bits *
+ (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+ }
+ else
+ group_bits = 0;
+
+ /* Boost for arf frame */
+#if NEW_BOOST
+ Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
+#else
+ Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+#endif
+ Boost += (i * 50);
+
+ /* Set max and minimum boost and hence minimum allocation */
+ if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+ Boost = ((cpi->baseline_gf_interval + 1) * 200);
+ else if (Boost < 125)
+ Boost = 125;
+
+ allocation_chunks = (i * 100) + Boost;
+
+ /* Normalize Boost and allocation chunks down to prevent overflow */
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Calculate the number of bits to be spent on the arf based on the
+ * boost number
+ */
+ arf_frame_bits = (int)((double)Boost * (group_bits /
+ (double)allocation_chunks));
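+ /* Illustrative split: with i = 10 frames, Boost = 500 and
+ * group_bits = 105000, allocation_chunks = 1500 and the arf is
+ * allocated 500/1500 of the pool, i.e. 35000 bits.
+ */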
+
+ /* Estimate if there are enough bits available to make worthwhile use
+ * of an arf.
+ */
+ tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
+
+ /* Only use an arf if it is likely we will be able to code
+ * it at a lower Q than the surrounding frames.
+ */
+ if (tmp_q < cpi->worst_quality)
+ {
+ int half_gf_int;
+ int frames_after_arf;
+ int frames_bwd = cpi->oxcf.arnr_max_frames - 1;
+ int frames_fwd = cpi->oxcf.arnr_max_frames - 1;
+
+ cpi->source_alt_ref_pending = 1;
+
+ /*
+ * For alt ref frames the error score for the end frame of the
+ * group (the alt ref frame) should not contribute to the group
+ * total and hence the number of bits allocated to the group.
+ * Rather it forms part of the next group (it is the GF at the
+ * start of the next group)
+ * gf_group_err -= mod_frame_err;
+ *
+ * For alt ref frames alt ref frame is technically part of the
+ * GF frame for the next group but we always base the error
+ * calculation and bit allocation on the current group of frames.
+ *
+ * Set the interval till the next gf or arf.
+ * For ARFs this is the number of frames to be coded before the
+ * future frame that is coded as an ARF.
+ * The future frame itself is part of the next group
+ */
+ cpi->baseline_gf_interval = i;
+
+ /*
+ * Define the arnr filter width for this group of frames:
+ * We only filter frames that lie within a distance of half
+ * the GF interval from the ARF frame. We also have to trap
+ * cases where the filter extends beyond the end of clip.
+ * Note: this_frame->frame has been updated in the loop
+ * so it now points at the ARF frame.
+ */
+ half_gf_int = cpi->baseline_gf_interval >> 1;
+ frames_after_arf = (int)(cpi->twopass.total_stats.count -
+ this_frame->frame - 1);
+
+ switch (cpi->oxcf.arnr_type)
+ {
+ case 1: /* Backward filter */
+ frames_fwd = 0;
+ if (frames_bwd > half_gf_int)
+ frames_bwd = half_gf_int;
+ break;
+
+ case 2: /* Forward filter */
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ frames_bwd = 0;
+ break;
+
+ case 3: /* Centered filter */
+ default:
+ frames_fwd >>= 1;
+ if (frames_fwd > frames_after_arf)
+ frames_fwd = frames_after_arf;
+ if (frames_fwd > half_gf_int)
+ frames_fwd = half_gf_int;
+
+ frames_bwd = frames_fwd;
+
+ /* For even length filter there is one more frame backward
+ * than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ */
+ if (frames_bwd < half_gf_int)
+ frames_bwd += (cpi->oxcf.arnr_max_frames+1) & 0x1;
+ break;
+ }
+
+ cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
+ }
+ else
+ {
+ cpi->source_alt_ref_pending = 0;
+ cpi->baseline_gf_interval = i;
+ }
+ }
+ else
+ {
+ cpi->source_alt_ref_pending = 0;
+ cpi->baseline_gf_interval = i;
+ }
+
+ /*
+ * Now decide how many bits should be allocated to the GF group as a
+ * proportion of those remaining in the kf group.
+ * The final key frame group in the clip is treated as a special case
+ * where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+ * This is also important for short clips where there may only be one
+ * key frame.
+ */
+ if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
+ cpi->common.current_video_frame))
+ {
+ cpi->twopass.kf_group_bits =
+ (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
+ }
+
+ /* Calculate the bits to be allocated to the group as a whole */
+ if ((cpi->twopass.kf_group_bits > 0) &&
+ (cpi->twopass.kf_group_error_left > 0))
+ {
+ cpi->twopass.gf_group_bits =
+ (int)((double)cpi->twopass.kf_group_bits *
+ (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+ }
+ else
+ cpi->twopass.gf_group_bits = 0;
+
+ cpi->twopass.gf_group_bits = (int)(
+ (cpi->twopass.gf_group_bits < 0)
+ ? 0
+ : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+ ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits);
+
+ /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
+ * variability limit (cpi->oxcf.two_pass_vbrmax_section)
+ */
+ if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
+ cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+ /* Reset the file position */
+ reset_fpf_position(cpi, start_pos);
+
+ /* Update the record of error used so far (only done once per gf group) */
+ cpi->twopass.modified_error_used += gf_group_err;
+
+ /* Assign bits to the arf or gf. */
+ for (i = 0;
+ i <= (cpi->source_alt_ref_pending &&
+ cpi->common.frame_type != KEY_FRAME); i++)
+ {
+ int Boost;
+ int allocation_chunks;
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int gf_bits;
+
+ /* For ARF frames */
+ if (cpi->source_alt_ref_pending && i == 0)
+ {
+#if NEW_BOOST
+ Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
+#else
+ Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+#endif
+ Boost += (cpi->baseline_gf_interval * 50);
+
+ /* Set max and minimum boost and hence minimum allocation */
+ if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+ Boost = ((cpi->baseline_gf_interval + 1) * 200);
+ else if (Boost < 125)
+ Boost = 125;
+
+ allocation_chunks =
+ ((cpi->baseline_gf_interval + 1) * 100) + Boost;
+ }
+ /* Else for standard golden frames */
+ else
+ {
+ /* boost based on inter / intra ratio of subsequent frames */
+ Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100;
+
+ /* Set max and minimum boost and hence minimum allocation */
+ if (Boost > (cpi->baseline_gf_interval * 150))
+ Boost = (cpi->baseline_gf_interval * 150);
+ else if (Boost < 125)
+ Boost = 125;
+
+ allocation_chunks =
+ (cpi->baseline_gf_interval * 100) + (Boost - 100);
+ }
+
+ /* Normalize Boost and allocation chunks down to prevent overflow */
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Calculate the number of bits to be spent on the gf or arf based on
+ * the boost number
+ */
+ gf_bits = (int)((double)Boost *
+ (cpi->twopass.gf_group_bits /
+ (double)allocation_chunks));
+
+ /* If the frame that is to be boosted is simpler than the average for
+ * the gf/arf group then use an alternative calculation
+ * based on the error score of the frame itself
+ */
+ if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval)
+ {
+ double alt_gf_grp_bits;
+ int alt_gf_bits;
+
+ alt_gf_grp_bits =
+ (double)cpi->twopass.kf_group_bits *
+ (mod_frame_err * (double)cpi->baseline_gf_interval) /
+ DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left);
+
+ alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits /
+ (double)allocation_chunks));
+
+ if (gf_bits > alt_gf_bits)
+ {
+ gf_bits = alt_gf_bits;
+ }
+ }
+ /* Else if it is harder than other frames in the group make sure it at
+ * least receives an allocation in keeping with its relative error
+ * score, otherwise it may be worse off than an "un-boosted" frame
+ */
+ else
+ {
+ int alt_gf_bits =
+ (int)((double)cpi->twopass.kf_group_bits *
+ mod_frame_err /
+ DOUBLE_DIVIDE_CHECK((double)cpi->twopass.kf_group_error_left));
+
+ if (alt_gf_bits > gf_bits)
+ {
+ gf_bits = alt_gf_bits;
+ }
+ }
+
+ /* Apply an additional limit for CBR */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->twopass.gf_bits > (int)(cpi->buffer_level >> 1))
+ cpi->twopass.gf_bits = (int)(cpi->buffer_level >> 1);
+ }
+
+ /* Don't allow a negative value for gf_bits */
+ if (gf_bits < 0)
+ gf_bits = 0;
+
+ /* Add in minimum for a frame */
+ gf_bits += cpi->min_frame_bandwidth;
+
+ if (i == 0)
+ {
+ cpi->twopass.gf_bits = gf_bits;
+ }
+ if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)))
+ {
+ /* Per frame bit target for this frame */
+ cpi->per_frame_bandwidth = gf_bits;
+ }
+ }
+
+ {
+ /* Adjust KF group bits and error remaining */
+ cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
+ cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
+
+ if (cpi->twopass.kf_group_bits < 0)
+ cpi->twopass.kf_group_bits = 0;
+
+ /* Note the error score left in the remaining frames of the group.
+ * For normal GFs we want to remove the error score for the first
+ * frame of the group (except in Key frame case where this has
+ * already happened)
+ */
+ if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+ cpi->twopass.gf_group_error_left = (int)(gf_group_err -
+ gf_first_frame_err);
+ else
+ cpi->twopass.gf_group_error_left = (int) gf_group_err;
+
+ cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+
+ /* This condition could fail if there are two kfs very close together
+ * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+ * calculation of cpi->twopass.alt_extra_bits.
+ */
+ if ( cpi->baseline_gf_interval >= 3 )
+ {
+#if NEW_BOOST
+ int boost = (cpi->source_alt_ref_pending)
+ ? b_boost : cpi->gfu_boost;
+#else
+ int boost = cpi->gfu_boost;
+#endif
+ if ( boost >= 150 )
+ {
+ int pct_extra;
+
+ pct_extra = (boost - 100) / 50;
+ pct_extra = (pct_extra > 20) ? 20 : pct_extra;
+
+ cpi->twopass.alt_extra_bits =
+ (cpi->twopass.gf_group_bits * pct_extra) / 100;
+ cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits;
+ cpi->twopass.alt_extra_bits /=
+ ((cpi->baseline_gf_interval-1)>>1);
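+ /* Illustrative: boost = 250 gives pct_extra = 3, so 3% of the
+ * gf group pool is set aside and shared between the
+ * (interval - 1) / 2 frames that receive alt_extra_bits in
+ * assign_std_frame_bits().
+ */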
+ }
+ else
+ cpi->twopass.alt_extra_bits = 0;
+ }
+ else
+ cpi->twopass.alt_extra_bits = 0;
+ }
+
+ /* Adjustments based on a measure of complexity of the section */
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ FIRSTPASS_STATS sectionstats;
+ double Ratio;
+
+ zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_pos);
+
+ for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
+ {
+ input_stats(cpi, &next_frame);
+ accumulate_stats(&sectionstats, &next_frame);
+ }
+
+ avg_stats(&sectionstats);
+
+ cpi->twopass.section_intra_rating = (unsigned int)
+ (sectionstats.intra_error /
+ DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+ Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+ if (cpi->twopass.section_max_qfactor < 0.80)
+ cpi->twopass.section_max_qfactor = 0.80;
+
+ reset_fpf_position(cpi, start_pos);
+ }
+}
+
+/* Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame. */
+static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ int target_frame_size;
+
+ double modified_err;
+ double err_fraction;
+
+ int max_bits = frame_max_bits(cpi); /* Max for a single frame */
+
+ /* Calculate modified prediction error used in bit allocation */
+ modified_err = calculate_modified_err(cpi, this_frame);
+
+ /* What portion of the remaining GF group error is used by this frame */
+ if (cpi->twopass.gf_group_error_left > 0)
+ err_fraction = modified_err / cpi->twopass.gf_group_error_left;
+ else
+ err_fraction = 0.0;
+
+ /* How many of those bits available for allocation should we give it? */
+ target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
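+ /* Illustrative: a frame carrying 5% of the remaining group error
+ * with gf_group_bits = 80000 is targeted at 4000 bits, before the
+ * per frame minimum is added back below.
+ */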
+
+ /* Clip the target size to the range 0 to max_bits
+ * (or cpi->twopass.gf_group_bits) at the top end.
+ */
+ if (target_frame_size < 0)
+ target_frame_size = 0;
+ else
+ {
+ if (target_frame_size > max_bits)
+ target_frame_size = max_bits;
+
+ if (target_frame_size > cpi->twopass.gf_group_bits)
+ target_frame_size = cpi->twopass.gf_group_bits;
+ }
+
+ /* Adjust error and bits remaining */
+ cpi->twopass.gf_group_error_left -= (int)modified_err;
+ cpi->twopass.gf_group_bits -= target_frame_size;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0;
+
+ /* Add in the minimum number of bits that is set aside for every frame. */
+ target_frame_size += cpi->min_frame_bandwidth;
+
+ /* Every other frame gets a few extra bits */
+ if ( (cpi->common.frames_since_golden & 0x01) &&
+ (cpi->frames_till_gf_update_due > 0) )
+ {
+ target_frame_size += cpi->twopass.alt_extra_bits;
+ }
+
+ /* Per frame bit target for this frame */
+ cpi->per_frame_bandwidth = target_frame_size;
+}
+
+void vp8_second_pass(VP8_COMP *cpi)
+{
+ int tmp_q;
+ int frames_left = (int)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
+
+ FIRSTPASS_STATS this_frame = {0};
+ FIRSTPASS_STATS this_frame_copy;
+
+ double this_frame_intra_error;
+ double this_frame_coded_error;
+
+ int overhead_bits;
+
+ if (!cpi->twopass.stats_in)
+ {
+ return ;
+ }
+
+ vp8_clear_system_state();
+
+ if (EOF == input_stats(cpi, &this_frame))
+ return;
+
+ this_frame_intra_error = this_frame.intra_error;
+ this_frame_coded_error = this_frame.coded_error;
+
+ /* Keyframe and section processing */
+ if (cpi->twopass.frames_to_key == 0)
+ {
+ /* Define next KF group and assign bits to it */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ find_next_key_frame(cpi, &this_frame_copy);
+
+ /* Special case: error_resilient_mode does not make much sense in
+ * two pass with its current meaning, but this code is designed to
+ * stop outlandish behaviour if someone does set it when using two
+ * pass. It effectively disables GF groups. This is temporary code
+ * until we decide what should really happen in this case.
+ */
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cpi->twopass.gf_group_bits = (int)cpi->twopass.kf_group_bits;
+ cpi->twopass.gf_group_error_left =
+ (int)cpi->twopass.kf_group_error_left;
+ cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ cpi->source_alt_ref_pending = 0;
+ }
+
+ }
+
+ /* Is this a GF / ARF (Note that a KF is always also a GF) */
+ if (cpi->frames_till_gf_update_due == 0)
+ {
+ /* Define next gf group and assign bits to it */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ define_gf_group(cpi, &this_frame_copy);
+
+ /* If we are going to code an altref frame at the end of the group
+ * and the current frame is not a key frame... If the previous
+ * group used an arf this frame has already benefited from that arf
+ * boost and it should not be given extra bits. If the previous
+ * group was NOT coded using arf we may want to apply some boost to
+ * this GF as well.
+ */
+ if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))
+ {
+ /* Assign a standard frames worth of bits from those allocated
+ * to the GF group
+ */
+ int bak = cpi->per_frame_bandwidth;
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ cpi->per_frame_bandwidth = bak;
+ }
+ }
+
+ /* Otherwise this is an ordinary frame */
+ else
+ {
+ /* Special case: error_resilient_mode does not make much sense in
+ * two pass with its current meaning, but this code is designed to
+ * stop outlandish behaviour if someone does set it when using two
+ * pass. It effectively disables GF groups. This is temporary code
+ * until we decide what should really happen in this case.
+ */
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cpi->frames_till_gf_update_due = cpi->twopass.frames_to_key;
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ /* Assign bits from those allocated to the GF group */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+ }
+ else
+ {
+ /* Assign bits from those allocated to the GF group */
+ vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ assign_std_frame_bits(cpi, &this_frame_copy);
+ }
+ }
+
+ /* Keep a globally available copy of this and the next frame's iiratio. */
+ cpi->twopass.this_iiratio = (unsigned int)(this_frame_intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
+ {
+ FIRSTPASS_STATS next_frame;
+ if ( lookup_next_frame_stats(cpi, &next_frame) != EOF )
+ {
+ cpi->twopass.next_iiratio = (unsigned int)(next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+ }
+ }
+
+ /* Set nominal per second bandwidth for this frame */
+ cpi->target_bandwidth = (int)
+ (cpi->per_frame_bandwidth * cpi->output_frame_rate);
+ if (cpi->target_bandwidth < 0)
+ cpi->target_bandwidth = 0;
+
+
+ /* Account for mv, mode and other overheads. */
+ overhead_bits = (int)estimate_modemvcost(
+ cpi, &cpi->twopass.total_left_stats );
+
+ /* Special case code for first frame. */
+ if (cpi->common.current_video_frame == 0)
+ {
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ /* Set a cq_level in constrained quality mode. */
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
+ {
+ int est_cq;
+
+ est_cq =
+ estimate_cq( cpi,
+ &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits );
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+ if ( est_cq > cpi->cq_target_quality )
+ cpi->cq_target_quality = est_cq;
+ }
+
+ /* guess at maxq needed in 2nd pass */
+ cpi->twopass.maxq_max_limit = cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = cpi->best_quality;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits );
+
+ /* Limit the maxq value returned subsequently.
+ * This increases the risk of overspend or underspend if the initial
+ * estimate for the clip is bad, but helps prevent excessive
+ * variation in Q, especially near the end of a clip
+ * where for example a small overspend may cause Q to crash
+ */
+ cpi->twopass.maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
+ ? (tmp_q + 32) : cpi->worst_quality;
+ cpi->twopass.maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
+ ? (tmp_q - 32) : cpi->best_quality;
+
+ cpi->active_worst_quality = tmp_q;
+ cpi->ni_av_qi = tmp_q;
+ }
+
+ /* The last few frames of a clip almost always have too few or too many
+ * bits and for the sake of overly exact rate control we don't want to
+ * make radical adjustments to the allowed quantizer range just to use
+ * up a few surplus bits or get beneath the target rate.
+ */
+ else if ( (cpi->common.current_video_frame <
+ (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
+ ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
+ (unsigned int)cpi->twopass.total_stats.count) )
+ {
+ if (frames_left < 1)
+ frames_left = 1;
+
+ tmp_q = estimate_max_q(
+ cpi,
+ &cpi->twopass.total_left_stats,
+ (int)(cpi->twopass.bits_left / frames_left),
+ overhead_bits );
+
+ /* Move active_worst_quality but in a damped way */
+ if (tmp_q > cpi->active_worst_quality)
+ cpi->active_worst_quality++;
+ else if (tmp_q < cpi->active_worst_quality)
+ cpi->active_worst_quality--;
+
+ cpi->active_worst_quality =
+ ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
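+ /* The blend (3 * current + tmp_q + 2) / 4 moves active_worst_quality
+ * about a quarter of the way toward the new estimate each frame,
+ * with the +2 providing rounding, so late clip corrections stay
+ * gradual.
+ */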
+ }
+
+ cpi->twopass.frames_to_key--;
+
+ /* Update the total stats remaining structure */
+ subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
+}
+
+
+static int test_candidate_kf(VP8_COMP *cpi,
+ FIRSTPASS_STATS *last_frame,
+ FIRSTPASS_STATS *this_frame,
+ FIRSTPASS_STATS *next_frame)
+{
+ int is_viable_kf = 0;
+
+ /* Does the frame satisfy the primary criteria of a key frame
+ * If so, then examine how well it predicts subsequent frames
+ */
+ if ((this_frame->pcnt_second_ref < 0.10) &&
+ (next_frame->pcnt_second_ref < 0.10) &&
+ ((this_frame->pcnt_inter < 0.05) ||
+ (
+ ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
+ ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+ ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+ )
+ )
+ )
+ )
+ {
+ int i;
+ FIRSTPASS_STATS *start_pos;
+
+ FIRSTPASS_STATS local_next_frame;
+
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+ double next_iiratio;
+
+ vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+ /* Note the starting file position so we can reset to it */
+ start_pos = cpi->twopass.stats_in;
+
+ /* Examine how well the key frame predicts subsequent frames */
+ for (i = 0 ; i < 16; i++)
+ {
+ next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ;
+
+ if (next_iiratio > RMAX)
+ next_iiratio = RMAX;
+
+ /* Cumulative effect of decay in prediction quality */
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+ else
+ decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
+
+ /* Keep a running total */
+ boost_score += (decay_accumulator * next_iiratio);
+
+ /* Test various breakout clauses */
+ if ((local_next_frame.pcnt_inter < 0.05) ||
+ (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter -
+ local_next_frame.pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 0.5) ||
+ (local_next_frame.intra_error < 200)
+ )
+ {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ /* Get the next frame details */
+ if (EOF == input_stats(cpi, &local_next_frame))
+ break;
+ }
+
+ /* If there is tolerable prediction for at least the next 3 frames
+ * then break out, else discard this potential key frame and move on
+ */
+ if (boost_score > 5.0 && (i > 3))
+ is_viable_kf = 1;
+ else
+ {
+ /* Reset the file position */
+ reset_fpf_position(cpi, start_pos);
+
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+ int i,j;
+ FIRSTPASS_STATS last_frame;
+ FIRSTPASS_STATS first_frame;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS *start_position;
+
+ double decay_accumulator = 1.0;
+ double boost_score = 0;
+ double old_boost_score = 0.0;
+ double loop_decay_rate;
+
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double kf_group_intra_err = 0.0;
+ double kf_group_coded_err = 0.0;
+ double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
+
+ vpx_memset(&next_frame, 0, sizeof(next_frame));
+
+ vp8_clear_system_state();
+ start_position = cpi->twopass.stats_in;
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ /* is this a forced key frame by interval */
+ cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
+ /* Clear the alt ref active flag as this can never be active on a key
+ * frame
+ */
+ cpi->source_alt_ref_active = 0;
+
+ /* Kf is always a gf so clear frames till next gf counter */
+ cpi->frames_till_gf_update_due = 0;
+
+ cpi->twopass.frames_to_key = 1;
+
+ /* Take a copy of the initial frame details */
+ vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+ cpi->twopass.kf_group_bits = 0;
+ cpi->twopass.kf_group_error_left = 0;
+
+ kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+ /* find the next keyframe */
+ i = 0;
+ while (cpi->twopass.stats_in < cpi->twopass.stats_in_end)
+ {
+ /* Accumulate kf group error */
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+ /* These figures keep intra and coded error counts for all frames
+ * including key frames in the group. The effect of the key frame
+ * itself can be subtracted out using the first_frame data
+ * collected above
+ */
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+
+ /* Load the next frame's stats */
+ vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+ input_stats(cpi, this_frame);
+
+ /* Provided that we are not at the end of the file... */
+ if (cpi->oxcf.auto_key
+ && lookup_next_frame_stats(cpi, &next_frame) != EOF)
+ {
+ /* Normal scene cut check */
+ if ( ( i >= MIN_GF_INTERVAL ) &&
+ test_candidate_kf(cpi, &last_frame, this_frame, &next_frame) )
+ {
+ break;
+ }
+
+ /* How fast is prediction quality decaying */
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ /* We want to know something about the recent past... rather than
+ * as used elsewhere where we are concerned with decay in prediction
+ * quality since the last GF or KF.
+ */
+ recent_loop_decay[i%8] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < 8; j++)
+ {
+ decay_accumulator = decay_accumulator * recent_loop_decay[j];
+ }
+
+ /* Special check for transition or high motion followed by a
+ * static scene.
+ */
+ if ( detect_transition_to_still( cpi, i,
+ (cpi->key_frame_frequency-i),
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ break;
+ }
+
+
+ /* Step on to the next frame */
+ cpi->twopass.frames_to_key++;
+
+ /* If we don't have a real key frame within the next two
+ * forcekeyframeevery intervals then break out of the loop.
+ */
+ if (cpi->twopass.frames_to_key >= 2 *(int)cpi->key_frame_frequency)
+ break;
+ }
+ else
+ cpi->twopass.frames_to_key++;
+
+ i++;
+ }
+
+ /* If there is a max kf interval set by the user we must obey it.
+ * We already break out of the loop above at 2x max.
+ * This code centers the extra kf if the actual natural
+ * interval is between 1x and 2x
+ */
+ if (cpi->oxcf.auto_key
+ && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency )
+ {
+ FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in;
+ FIRSTPASS_STATS tmp_frame;
+
+ cpi->twopass.frames_to_key /= 2;
+
+ /* Copy first frame details */
+ vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+
+ /* Reset to the start of the group */
+ reset_fpf_position(cpi, start_position);
+
+ kf_group_err = 0;
+ kf_group_intra_err = 0;
+ kf_group_coded_err = 0;
+
+ /* Rescan to get the correct error data for the forced kf group */
+ for( i = 0; i < cpi->twopass.frames_to_key; i++ )
+ {
+ /* Accumulate kf group errors */
+ kf_group_err += calculate_modified_err(cpi, &tmp_frame);
+ kf_group_intra_err += tmp_frame.intra_error;
+ kf_group_coded_err += tmp_frame.coded_error;
+
+ /* Load the next frame's stats */
+ input_stats(cpi, &tmp_frame);
+ }
+
+ /* Reset to the start of the group */
+ reset_fpf_position(cpi, current_pos);
+
+ cpi->next_key_frame_forced = 1;
+ }
+ else
+ cpi->next_key_frame_forced = 0;
+
+ /* Special case for the last frame of the file */
+ if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
+ {
+ /* Accumulate kf group error */
+ kf_group_err += calculate_modified_err(cpi, this_frame);
+
+ /* These figures keep intra and coded error counts for all frames
+ * including key frames in the group. The effect of the key frame
+ * itself can be subtracted out using the first_frame data
+ * collected above
+ */
+ kf_group_intra_err += this_frame->intra_error;
+ kf_group_coded_err += this_frame->coded_error;
+ }
+
+ /* Calculate the number of bits that should be assigned to the kf group. */
+ if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0))
+ {
+ /* Max for a single normal frame (not key frame) */
+ int max_bits = frame_max_bits(cpi);
+
+ /* Maximum bits for the kf group */
+ int64_t max_grp_bits;
+
+ /* Default allocation based on bits left and relative
+ * complexity of the section
+ */
+ cpi->twopass.kf_group_bits = (int64_t)( cpi->twopass.bits_left *
+ ( kf_group_err /
+ cpi->twopass.modified_error_left ));
+
+ /* Clip based on maximum per frame rate defined by the user. */
+ max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
+ if (cpi->twopass.kf_group_bits > max_grp_bits)
+ cpi->twopass.kf_group_bits = max_grp_bits;
+
+ /* Additional special case for CBR if buffer is getting full. */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ int64_t opt_buffer_lvl = cpi->oxcf.optimal_buffer_level;
+ int64_t buffer_lvl = cpi->buffer_level;
+
+ /* If the buffer is near or above the optimal and this kf group is
+ * not being allocated much then increase the allocation a bit.
+ */
+ if (buffer_lvl >= opt_buffer_lvl)
+ {
+ int64_t high_water_mark = (opt_buffer_lvl +
+ cpi->oxcf.maximum_buffer_size) >> 1;
+
+ int64_t av_group_bits;
+
+ /* Av bits per frame * number of frames */
+ av_group_bits = (int64_t)cpi->av_per_frame_bandwidth *
+ (int64_t)cpi->twopass.frames_to_key;
+
+ /* We are at or above the maximum. */
+ if (cpi->buffer_level >= high_water_mark)
+ {
+ int64_t min_group_bits;
+
+ min_group_bits = av_group_bits +
+ (int64_t)(buffer_lvl -
+ high_water_mark);
+
+ if (cpi->twopass.kf_group_bits < min_group_bits)
+ cpi->twopass.kf_group_bits = min_group_bits;
+ }
+ /* We are above optimal but below the maximum */
+ else if (cpi->twopass.kf_group_bits < av_group_bits)
+ {
+ int64_t bits_below_av = av_group_bits -
+ cpi->twopass.kf_group_bits;
+
+ cpi->twopass.kf_group_bits +=
+ (int64_t)((double)bits_below_av *
+ (double)(buffer_lvl - opt_buffer_lvl) /
+ (double)(high_water_mark - opt_buffer_lvl));
+ }
+ }
+ }
+ }
+ else
+ cpi->twopass.kf_group_bits = 0;
+
+ /* Reset the first pass file position */
+ reset_fpf_position(cpi, start_position);
+
+ /* determine how big to make this keyframe based on how well the
+ * subsequent frames use inter blocks
+ */
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ loop_decay_rate = 1.00; /* Starting decay rate */
+
+ for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
+ {
+ double r;
+
+ if (EOF == input_stats(cpi, &next_frame))
+ break;
+
+ if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
+ r = (IIKFACTOR2 * next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+ else
+ r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min /
+ DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
+
+ if (r > RMAX)
+ r = RMAX;
+
+ /* How fast is prediction quality decaying */
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+ decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+
+ boost_score += (decay_accumulator * r);
+
+ if ((i > MIN_GF_INTERVAL) &&
+ ((boost_score - old_boost_score) < 1.0))
+ {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+ if (1)
+ {
+ FIRSTPASS_STATS sectionstats;
+ double Ratio;
+
+ zero_stats(&sectionstats);
+ reset_fpf_position(cpi, start_position);
+
+ for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
+ {
+ input_stats(cpi, &next_frame);
+ accumulate_stats(&sectionstats, &next_frame);
+ }
+
+ avg_stats(&sectionstats);
+
+ cpi->twopass.section_intra_rating = (unsigned int)
+ (sectionstats.intra_error
+ / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
+
+ Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+ cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+ if (cpi->twopass.section_max_qfactor < 0.80)
+ cpi->twopass.section_max_qfactor = 0.80;
+ }
+
+ /* When using CBR apply additional buffer fullness related upper limits */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ double max_boost;
+
+ if (cpi->drop_frames_allowed)
+ {
+ int df_buffer_level = (int)(cpi->oxcf.drop_frames_water_mark
+ * (cpi->oxcf.optimal_buffer_level / 100));
+
+ if (cpi->buffer_level > df_buffer_level)
+ max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ else
+ max_boost = 0.0;
+ }
+ else if (cpi->buffer_level > 0)
+ {
+ max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+ }
+ else
+ {
+ max_boost = 0.0;
+ }
+
+ if (boost_score > max_boost)
+ boost_score = max_boost;
+ }
+
+ /* Reset the first pass file position */
+ reset_fpf_position(cpi, start_position);
+
+ /* Work out how many bits to allocate for the key frame itself */
+ if (1)
+ {
+ int kf_boost = (int)boost_score;
+ int allocation_chunks;
+ int Counter = cpi->twopass.frames_to_key;
+ int alt_kf_bits;
+ YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+ /* Min boost based on kf interval */
+#if 0
+
+ while ((kf_boost < 48) && (Counter > 0))
+ {
+ Counter -= 2;
+ kf_boost ++;
+ }
+
+#endif
+
+ if (kf_boost < 48)
+ {
+ kf_boost += ((Counter + 1) >> 1);
+
+ if (kf_boost > 48) kf_boost = 48;
+ }
+
+        /* Bigger frame sizes need larger kf boosts; smaller frames need
+         * smaller boosts.
+         */
+ if ((lst_yv12->y_width * lst_yv12->y_height) > (320 * 240))
+ kf_boost += 2 * (lst_yv12->y_width * lst_yv12->y_height) / (320 * 240);
+ else if ((lst_yv12->y_width * lst_yv12->y_height) < (320 * 240))
+ kf_boost -= 4 * (320 * 240) / (lst_yv12->y_width * lst_yv12->y_height);
+
+ /* Min KF boost */
+ kf_boost = (int)((double)kf_boost * 100.0) >> 4; /* Scale 16 to 100 */
+ if (kf_boost < 250)
+ kf_boost = 250;
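+
+        /* Worked example of the scaling above (illustrative numbers): the
+         * (x * 100) >> 4 step maps the 16-point boost scale to a 100-point
+         * scale, so kf_boost = 48 becomes (48 * 100) >> 4 = 300 and clears
+         * the 250 floor, while kf_boost = 32 becomes 200 and is raised to
+         * 250.
+         */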
+
+        /*
+         * We do three calculations for kf size.
+         * The first is based on the error score for the whole kf group.
+         * The second (optionally) is based on the key frame's own error if
+         * this is smaller than the average for the group.
+         * The final one ensures that the frame receives at least the
+         * allocation it would have received based on its own error score vs
+         * the error score remaining.
+         * Special case if the sequence appears almost totally static
+         * as measured by the decay accumulator. In this case we want to
+         * spend almost all of the bits on the key frame.
+         * cpi->twopass.frames_to_key-1 because the key frame itself is
+         * taken care of by kf_boost.
+         */
+ if ( decay_accumulator >= 0.99 )
+ {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost;
+ }
+ else
+ {
+ allocation_chunks =
+ ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
+ }
+
+        /* Normalize kf_boost and allocation chunks down to prevent overflow */
+ while (kf_boost > 1000)
+ {
+ kf_boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
+
+ /* Calculate the number of bits to be spent on the key frame */
+ cpi->twopass.kf_bits = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
+
+ /* Apply an additional limit for CBR */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->twopass.kf_bits > (int)((3 * cpi->buffer_level) >> 2))
+ cpi->twopass.kf_bits = (int)((3 * cpi->buffer_level) >> 2);
+ }
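+
+        /* Worked example of the allocation above (illustrative numbers):
+         * with frames_to_key = 30, kf_boost = 300 and a non-static group,
+         * allocation_chunks = 29 * 100 + 300 = 3200; with
+         * kf_group_bits = 320,000 the key frame gets
+         * 300 * (320000 / 3200) = 30,000 bits, about 3x the ~10,000 bits a
+         * normal frame's 100 chunks would earn.
+         */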
+
+        /* If the key frame is actually easier than the average for the
+         * kf group (which does sometimes happen, e.g. a blank intro frame),
+         * then use an alternate calculation based on the kf error score,
+         * which should give a smaller key frame.
+         */
+ if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key)
+ {
+ double alt_kf_grp_bits =
+ ((double)cpi->twopass.bits_left *
+ (kf_mod_err * (double)cpi->twopass.frames_to_key) /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left));
+
+ alt_kf_bits = (int)((double)kf_boost *
+ (alt_kf_grp_bits / (double)allocation_chunks));
+
+ if (cpi->twopass.kf_bits > alt_kf_bits)
+ {
+ cpi->twopass.kf_bits = alt_kf_bits;
+ }
+ }
+        /* Else if it is much harder than other frames in the group, make
+         * sure it at least receives an allocation in keeping with its
+         * relative error score.
+         */
+ else
+ {
+ alt_kf_bits =
+ (int)((double)cpi->twopass.bits_left *
+ (kf_mod_err /
+ DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)));
+
+ if (alt_kf_bits > cpi->twopass.kf_bits)
+ {
+ cpi->twopass.kf_bits = alt_kf_bits;
+ }
+ }
+
+ cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
+ /* Add in the minimum frame allowance */
+ cpi->twopass.kf_bits += cpi->min_frame_bandwidth;
+
+        /* Per frame bit target for this frame */
+ cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
+
+ /* Convert to a per second bitrate */
+ cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
+ cpi->output_frame_rate);
+ }
+
+ /* Note the total error score of the kf group minus the key frame itself */
+ cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ /* Adjust the count of total modified error left. The count of bits left
+ * is adjusted elsewhere based on real coded frame sizes
+ */
+ cpi->twopass.modified_error_left -= kf_group_err;
+
+ if (cpi->oxcf.allow_spatial_resampling)
+ {
+ int resample_trigger = 0;
+ int last_kf_resampled = 0;
+ int kf_q;
+ int scale_val = 0;
+ int hr, hs, vr, vs;
+ int new_width = cpi->oxcf.Width;
+ int new_height = cpi->oxcf.Height;
+
+ int projected_buffer_level = (int)cpi->buffer_level;
+ int tmp_q;
+
+ double projected_bits_perframe;
+ double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / (kf_group_coded_err - first_frame.coded_error);
+ double err_per_frame = kf_group_err / cpi->twopass.frames_to_key;
+ double bits_per_frame;
+ double av_bits_per_frame;
+ double effective_size_ratio;
+
+ if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
+ last_kf_resampled = 1;
+
+        /* Set back to unscaled by default */
+ cpi->common.horiz_scale = NORMAL;
+ cpi->common.vert_scale = NORMAL;
+
+ /* Calculate Average bits per frame. */
+ av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
+
+        /* CBR: use the clip average as the target for deciding whether to resample */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ bits_per_frame = av_bits_per_frame;
+ }
+
+        /* In VBR we want to avoid downsampling in easy sections unless we
+         * are under extreme pressure, so use the larger of the target
+         * bitrate for this section or the average bitrate for the sequence.
+         */
+ else
+ {
+ /* This accounts for how hard the section is... */
+ bits_per_frame = (double)
+ (cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key);
+
+            /* Don't turn to resampling in easy sections just because they
+             * have been assigned a small number of bits.
+             */
+ if (bits_per_frame < av_bits_per_frame)
+ bits_per_frame = av_bits_per_frame;
+ }
+
+ /* bits_per_frame should comply with our minimum */
+ if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100))
+ bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+ /* Work out if spatial resampling is necessary */
+ kf_q = estimate_kf_group_q(cpi, err_per_frame,
+ (int)bits_per_frame, group_iiratio);
+
+        /* If we project a required Q higher than the maximum allowed Q,
+         * then make a guess at the actual size of frames in this section.
+         */
+ projected_bits_perframe = bits_per_frame;
+ tmp_q = kf_q;
+
+ while (tmp_q > cpi->worst_quality)
+ {
+ projected_bits_perframe *= 1.04;
+ tmp_q--;
+ }
+
+ /* Guess at buffer level at the end of the section */
+ projected_buffer_level = (int)
+ (cpi->buffer_level - (int)
+ ((projected_bits_perframe - av_bits_per_frame) *
+ cpi->twopass.frames_to_key));
+
+ if (0)
+ {
+ FILE *f = fopen("Subsamle.stt", "a");
+ fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), new_height, new_width);
+ fclose(f);
+ }
+
+ /* The trigger for spatial resampling depends on the various
+ * parameters such as whether we are streaming (CBR) or VBR.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+            /* Trigger resample if we are projected to fall below the
+             * down-sample level, or we resampled last time and are
+             * projected to remain below the up-sample level.
+             */
+ if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) ||
+ (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
+ resample_trigger = 1;
+ else
+ resample_trigger = 0;
+ }
+ else
+ {
+ int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
+ int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
+
+            /* If triggered last time, the threshold for triggering again
+             * is reduced:
+             *
+             * projected Q higher than allowed and overspend > 5% of total
+             * bits
+             */
+ if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||
+ ((kf_q > cpi->worst_quality) &&
+ (over_spend > clip_bits / 20)))
+ resample_trigger = 1;
+ else
+ resample_trigger = 0;
+
+ }
+
+ if (resample_trigger)
+ {
+ while ((kf_q >= cpi->worst_quality) && (scale_val < 6))
+ {
+ scale_val ++;
+
+ cpi->common.vert_scale = vscale_lookup[scale_val];
+ cpi->common.horiz_scale = hscale_lookup[scale_val];
+
+ Scale2Ratio(cpi->common.horiz_scale, &hr, &hs);
+ Scale2Ratio(cpi->common.vert_scale, &vr, &vs);
+
+ new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+ new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+                /* Reducing the area to 1/4 does not reduce the complexity
+                 * (err_per_frame) to 1/4... effective_size_ratio attempts
+                 * to provide a crude correction for this.
+                 */
+ effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height);
+ effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
+
+ /* Now try again and see what Q we get with the smaller
+ * image size
+ */
+ kf_q = estimate_kf_group_q(cpi,
+ err_per_frame * effective_size_ratio,
+ (int)bits_per_frame, group_iiratio);
+
+ if (0)
+ {
+ FILE *f = fopen("Subsamle.stt", "a");
+ fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, cpi->common.horiz_scale, cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), new_height, new_width);
+ fclose(f);
+ }
+ }
+ }
+
+ if ((cpi->common.Width != new_width) || (cpi->common.Height != new_height))
+ {
+ cpi->common.Width = new_width;
+ cpi->common.Height = new_height;
+ vp8_alloc_compressor_data(cpi);
+ }
+ }
+}
diff --git a/libvpx/vp8/encoder/firstpass.h b/libvpx/vp8/encoder/firstpass.h
new file mode 100644
index 0000000..95e1e54
--- /dev/null
+++ b/libvpx/vp8/encoder/firstpass.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define __INC_FIRSTPASS_H
+
+extern void vp8_init_first_pass(VP8_COMP *cpi);
+extern void vp8_first_pass(VP8_COMP *cpi);
+extern void vp8_end_first_pass(VP8_COMP *cpi);
+
+extern void vp8_init_second_pass(VP8_COMP *cpi);
+extern void vp8_second_pass(VP8_COMP *cpi);
+extern void vp8_end_second_pass(VP8_COMP *cpi);
+
+extern size_t vp8_firstpass_stats_sz(unsigned int mb_count);
+#endif
diff --git a/libvpx/vp8/encoder/lookahead.c b/libvpx/vp8/encoder/lookahead.c
new file mode 100644
index 0000000..ce2ce08
--- /dev/null
+++ b/libvpx/vp8/encoder/lookahead.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "lookahead.h"
+#include "vp8/common/extend.h"
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25)
+
+struct lookahead_ctx
+{
+ unsigned int max_sz; /* Absolute size of the queue */
+ unsigned int sz; /* Number of buffers currently in the queue */
+ unsigned int read_idx; /* Read index */
+ unsigned int write_idx; /* Write index */
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *
+pop(struct lookahead_ctx *ctx,
+ unsigned int *idx)
+{
+ unsigned int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if(++index >= ctx->max_sz)
+ index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
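+
+/* Worked example of the wrap above (illustrative numbers): with max_sz = 4
+ * and *idx = 3, pop() returns &buf[3] and wraps the index to 0, so
+ * successive calls walk 0, 1, 2, 3, 0, ... without a modulo operation.
+ */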
+
+
+void
+vp8_lookahead_destroy(struct lookahead_ctx *ctx)
+{
+ if(ctx)
+ {
+ if(ctx->buf)
+ {
+ unsigned int i;
+
+ for(i = 0; i < ctx->max_sz; i++)
+ vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+
+struct lookahead_ctx*
+vp8_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int depth)
+{
+ struct lookahead_ctx *ctx = NULL;
+ unsigned int i;
+
+ /* Clamp the lookahead queue depth */
+ if(depth < 1)
+ depth = 1;
+ else if(depth > MAX_LAG_BUFFERS)
+ depth = MAX_LAG_BUFFERS;
+
+ /* Keep last frame in lookahead buffer by increasing depth by 1.*/
+ depth += 1;
+
+ /* Align the buffer dimensions */
+ width = (width + 15) & ~15;
+ height = (height + 15) & ~15;
+
+ /* Allocate the lookahead structures */
+ ctx = calloc(1, sizeof(*ctx));
+ if(ctx)
+ {
+ ctx->max_sz = depth;
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if(!ctx->buf)
+ goto bail;
+ for(i=0; i<depth; i++)
+ if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img,
+ width, height, VP8BORDERINPIXELS))
+ goto bail;
+ }
+ return ctx;
+bail:
+ vp8_lookahead_destroy(ctx);
+ return NULL;
+}
+
+
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+ YV12_BUFFER_CONFIG *src,
+ int64_t ts_start,
+ int64_t ts_end,
+ unsigned int flags,
+ unsigned char *active_map)
+{
+ struct lookahead_entry* buf;
+ int row, col, active_end;
+ int mb_rows = (src->y_height + 15) >> 4;
+ int mb_cols = (src->y_width + 15) >> 4;
+
+ if(ctx->sz + 2 > ctx->max_sz)
+ return 1;
+ ctx->sz++;
+ buf = pop(ctx, &ctx->write_idx);
+
+    /* Only do this partial copy if the following conditions are all met:
+     * 1. The lookahead queue has a size of 1.
+     * 2. An active map is provided.
+     * 3. This is not a key frame, golden, or altref frame.
+     */
+ if (ctx->max_sz == 1 && active_map && !flags)
+ {
+ for (row = 0; row < mb_rows; ++row)
+ {
+ col = 0;
+
+ while (1)
+ {
+ /* Find the first active macroblock in this row. */
+ for (; col < mb_cols; ++col)
+ {
+ if (active_map[col])
+ break;
+ }
+
+                /* No more active macroblocks in this row. */
+ if (col == mb_cols)
+ break;
+
+ /* Find the end of active region in this row. */
+ active_end = col;
+
+ for (; active_end < mb_cols; ++active_end)
+ {
+ if (!active_map[active_end])
+ break;
+ }
+
+ /* Only copy this active region. */
+ vp8_copy_and_extend_frame_with_rect(src, &buf->img,
+ row << 4,
+ col << 4, 16,
+ (active_end - col) << 4);
+
+ /* Start again from the end of this active region. */
+ col = active_end;
+ }
+
+ active_map += mb_cols;
+ }
+ }
+ else
+ {
+ vp8_copy_and_extend_frame(src, &buf->img);
+ }
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->flags = flags;
+ return 0;
+}
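+
+/* Note on the fullness check above (a reading of the code, not upstream
+ * documentation): max_sz is the configured depth plus one, and the
+ * "sz + 2 > max_sz" test refuses a push once sz reaches max_sz - 1, so one
+ * slot always stays free and the most recently popped frame survives as a
+ * last-frame reference.
+ */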
+
+
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain)
+{
+ struct lookahead_entry* buf = NULL;
+
+ if(ctx->sz && (drain || ctx->sz == ctx->max_sz - 1))
+ {
+ buf = pop(ctx, &ctx->read_idx);
+ ctx->sz--;
+ }
+ return buf;
+}
+
+
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+ unsigned int index,
+ int direction)
+{
+ struct lookahead_entry* buf = NULL;
+
+ if (direction == PEEK_FORWARD)
+ {
+ assert(index < ctx->max_sz - 1);
+ if(index < ctx->sz)
+ {
+ index += ctx->read_idx;
+ if(index >= ctx->max_sz)
+ index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ }
+ else if (direction == PEEK_BACKWARD)
+ {
+ assert(index == 1);
+
+ if(ctx->read_idx == 0)
+ index = ctx->max_sz - 1;
+ else
+ index = ctx->read_idx - index;
+ buf = ctx->buf + index;
+ }
+
+ return buf;
+}
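+
+/* Worked example of PEEK_BACKWARD above (illustrative numbers): with
+ * max_sz = 4 and read_idx = 0, index 1 wraps to buf[3], the slot the most
+ * recent pop advanced past; with read_idx = 2 it simply returns buf[1].
+ */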
+
+
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx)
+{
+ return ctx->sz;
+}
diff --git a/libvpx/vp8/encoder/lookahead.h b/libvpx/vp8/encoder/lookahead.h
new file mode 100644
index 0000000..cf56b75
--- /dev/null
+++ b/libvpx/vp8/encoder/lookahead.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef LOOKAHEAD_H
+#define LOOKAHEAD_H
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+struct lookahead_entry
+{
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ unsigned int flags;
+};
+
+
+struct lookahead_ctx;
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ *
+ *
+ */
+struct lookahead_ctx* vp8_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int depth
+ );
+
+
+/**\brief Destroys the lookahead stage
+ *
+ */
+void vp8_lookahead_destroy(struct lookahead_ctx *ctx);
+
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * If active_map is non-NULL and there is only one frame in the queue, then copy
+ * only active macroblocks.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] flags Flags set on this frame
+ * \param[in] active_map Map that specifies which macroblocks are active
+ */
+int
+vp8_lookahead_push(struct lookahead_ctx *ctx,
+ YV12_BUFFER_CONFIG *src,
+ int64_t ts_start,
+ int64_t ts_end,
+ unsigned int flags,
+ unsigned char *active_map);
+
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain);
+
+
+#define PEEK_FORWARD 1
+#define PEEK_BACKWARD -1
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ *
+ */
+struct lookahead_entry*
+vp8_lookahead_peek(struct lookahead_ctx *ctx,
+ unsigned int index,
+ int direction);
+
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int
+vp8_lookahead_depth(struct lookahead_ctx *ctx);
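+
+/* Minimal usage sketch (illustrative only; buffer setup, error handling and
+ * the encode_frame() call are hypothetical, not upstream API):
+ *
+ *     struct lookahead_ctx *la = vp8_lookahead_init(640, 480, 8);
+ *     YV12_BUFFER_CONFIG src;                    // assumed already allocated
+ *     vp8_lookahead_push(la, &src, ts, ts + 1, 0, NULL);
+ *     struct lookahead_entry *e = vp8_lookahead_pop(la, 1);
+ *     if (e) encode_frame(&e->img);              // hypothetical consumer
+ *     vp8_lookahead_destroy(la);
+ */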
+
+
+#endif
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
new file mode 100644
index 0000000..b08c7a5
--- /dev/null
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -0,0 +1,2026 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_config.h"
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+#include "vp8/common/findnearmv.h"
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight)
+{
+    /* MV costing is based on the distribution of vectors in the previous
+     * frame and as such will tend to overstate the cost of vectors. In
+     * addition, coding a new vector can have a knock-on effect on the cost
+     * of subsequent vectors and the quality of prediction from NEAR and
+     * NEAREST for subsequent blocks. The "Weight" parameter allows, to a
+     * limited extent, for some account to be taken of these factors.
+     */
+ return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * Weight) >> 7;
+}
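+
+/* Worked example of the cost above (illustrative numbers): if the row and
+ * column table lookups return 40 and 60 and Weight = 128, the cost is
+ * ((40 + 60) * 128) >> 7 = 100, i.e. Weight = 128 is the neutral scale and
+ * smaller weights discount the table cost.
+ */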
+
+static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit)
+{
+ /* Ignore mv costing if mvcost is NULL */
+ if (mvcost)
+ return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
+ mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1])
+ * error_per_bit + 128) >> 8;
+ return 0;
+}
+
+static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvsadcost[2], int error_per_bit)
+{
+ /* Calculate sad error cost on full pixel basis. */
+ /* Ignore mv costing if mvsadcost is NULL */
+ if (mvsadcost)
+ return ((mvsadcost[0][(mv->as_mv.row - ref->as_mv.row)] +
+ mvsadcost[1][(mv->as_mv.col - ref->as_mv.col)])
+ * error_per_bit + 128) >> 8;
+ return 0;
+}
+
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+
+ /* Generate offsets for 4 search sites per step. */
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ /* Contract. */
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 4;
+}
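+
+/* Worked example of the site generation above (assuming MAX_FIRST_STEP is a
+ * power of two, e.g. 8): Len takes the values 8, 4, 2, 1, giving four steps
+ * of four "+"-pattern sites each plus the zero-MV site, so
+ * ss_count = 1 + 4 * 4 = 17.
+ */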
+
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+ int Len;
+ int search_site_count = 0;
+
+ /* Generate offsets for 8 search sites per step. */
+ Len = MAX_FIRST_STEP;
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = 0;
+ search_site_count++;
+
+ while (Len > 0)
+ {
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = 0;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = -Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = 0;
+ x->ss[search_site_count].offset = Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride - Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = -Len;
+ x->ss[search_site_count].offset = -Len * stride + Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = -Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride - Len;
+ search_site_count++;
+
+ /* Compute offsets for search sites. */
+ x->ss[search_site_count].mv.col = Len;
+ x->ss[search_site_count].mv.row = Len;
+ x->ss[search_site_count].offset = Len * stride + Len;
+ search_site_count++;
+
+
+ /* Contract. */
+ Len /= 2;
+ }
+
+ x->ss_count = search_site_count;
+ x->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty for a cache-line-crossing read, preload the reference
+ * area into a small buffer that is aligned to ensure reads from it never
+ * cross a cache line. This reduces the CPU cycles spent on reading ref data
+ * in the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22 rows x 32 cols area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r,c) (mvcost ? ((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 : 0)
+/* pointer to predictor base of a motion vector */
+#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset)))
+/* convert motion vector component to offset for svf calc */
+#define SP(x) (((x)&3)<<1)
+/* returns subpixel variance error function. */
+#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse)
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+/* returns distortion + motion vector cost */
+#define ERR(r,c) (MVC(r,c)+DIST(r,c))
+/* checks if (r,c) has better score than previous best */
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=UINT_MAX;)
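+
+/* Reading of the macros above (not upstream documentation): CHECK_BETTER
+ * evaluates a candidate (r,c) only when IFMVCV confirms it lies inside the
+ * clamped MV range, scores it as vector cost plus sub-pixel distortion, and
+ * on improvement records the new best error, position and SSE;
+ * out-of-range candidates are scored UINT_MAX so they can never win.
+ */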
+
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse1)
+{
+ unsigned char *z = (*(b->base_src) + b->src);
+
+ int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
+ int br = bestmv->as_mv.row << 2, bc = bestmv->as_mv.col << 2;
+ int tr = br, tc = bc;
+ unsigned int besterr;
+ unsigned int left, right, up, down, diag;
+ unsigned int sse;
+ unsigned int whichdir;
+ unsigned int halfiters = 4;
+ unsigned int quarteriters = 4;
+ int thismse;
+
+ int minc = MAX(x->mv_col_min << 2, (ref_mv->as_mv.col >> 1) - ((1 << mvlong_width) - 1));
+ int maxc = MIN(x->mv_col_max << 2, (ref_mv->as_mv.col >> 1) + ((1 << mvlong_width) - 1));
+ int minr = MAX(x->mv_row_min << 2, (ref_mv->as_mv.row >> 1) - ((1 << mvlong_width) - 1));
+ int maxr = MIN(x->mv_row_max << 2, (ref_mv->as_mv.row >> 1) + ((1 << mvlong_width) - 1));
+
+ int y_stride;
+ int offset;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+
+#if ARCH_X86 || ARCH_X86_64
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+ int buf_r1, buf_r2, buf_c1, buf_c2;
+
+ /* Clamping to avoid out-of-range data access */
+ buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
+ buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
+ buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
+ buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3;
+ y_stride = 32;
+
+ /* Copy to intermediate buffer before searching. */
+ vfp->copymem(y0 - buf_c1 - pre_stride*buf_r1, pre_stride, xd->y_buf, y_stride, 16+buf_r1+buf_r2);
+ y = xd->y_buf + y_stride*buf_r1 +buf_c1;
+#else
+ unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ y_stride = pre_stride;
+#endif
+
+ offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+ /* central mv */
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+
+ /* calculate central point error */
+ besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration (could be 2 if diag selected).
+     */
+ while (--halfiters)
+ {
+ /* 1/2 pel */
+ CHECK_BETTER(left, tr, tc - 2);
+ CHECK_BETTER(right, tr, tc + 2);
+ CHECK_BETTER(up, tr - 2, tc);
+ CHECK_BETTER(down, tr + 2, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 2, tc - 2);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 2, tc + 2);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 2, tc - 2);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 2, tc + 2);
+ break;
+ }
+
+ /* no reason to check the same one again. */
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration (could be 2 if diag selected).
+     */
+
+ /* 1/4 pel */
+ while (--quarteriters)
+ {
+ CHECK_BETTER(left, tr, tc - 1);
+ CHECK_BETTER(right, tr, tc + 1);
+ CHECK_BETTER(up, tr - 1, tc);
+ CHECK_BETTER(down, tr + 1, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir)
+ {
+ case 0:
+ CHECK_BETTER(diag, tr - 1, tc - 1);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - 1, tc + 1);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + 1, tc - 1);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + 1, tc + 1);
+ break;
+ }
+
+ /* no reason to check the same one again. */
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ bestmv->as_mv.row = br << 1;
+ bestmv->as_mv.col = bc << 1;
+
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL<<3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL<<3)))
+ return INT_MAX;
+
+ return besterr;
+}
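+
+/* Note on units (a reading of the code): br/bc above are in quarter-pel
+ * units (full-pel position << 2), and the final "<< 1" converts the result
+ * to the eighth-pel MV units used elsewhere in this file (cf. the "<<= 3"
+ * full-pel conversions).
+ */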
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef IFMVCV
+#undef ERR
+#undef CHECK_BETTER
+
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse1)
+{
+ int bestmse = INT_MAX;
+ int_mv startmv;
+ int_mv this_mv;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+    int whichdir;
+ int thismse;
+ int y_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+#if ARCH_X86 || ARCH_X86_64
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+
+ y_stride = 32;
+ /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+ vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+ y = xd->y_buf + y_stride + 1;
+#else
+ unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ y_stride = pre_stride;
+#endif
+
+ /* central mv */
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ startmv = *bestmv;
+
+ /* calculate central point error */
+ bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = bestmse;
+ bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ /* go left then right and check error */
+ this_mv.as_mv.row = startmv.as_mv.row;
+ this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+ thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 8;
+ thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* go up then down and check error */
+ this_mv.as_mv.col = startmv.as_mv.col;
+ this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+ thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 8;
+ thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ /* now check 1 more diagonal */
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ default:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ /* time to check quarter pels. */
+ if (bestmv->as_mv.row < startmv.as_mv.row)
+ y -= y_stride;
+
+ if (bestmv->as_mv.col < startmv.as_mv.col)
+ y--;
+
+ startmv = *bestmv;
+
+
+
+ /* go left then right and check error */
+ this_mv.as_mv.row = startmv.as_mv.row;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col = startmv.as_mv.col - 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 4;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* go up then down and check error */
+ this_mv.as_mv.col = startmv.as_mv.col;
+
+ if (startmv.as_mv.row & 7)
+ {
+ this_mv.as_mv.row = startmv.as_mv.row - 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+ thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+
+ /* now check 1 more diagonal */
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+
+ if (startmv.as_mv.row & 7)
+ {
+ this_mv.as_mv.row -= 2;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+                thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ }
+ else
+ {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - y_stride - 1, y_stride, 6, 6, z, b->src_stride, &sse);
+ }
+ }
+
+ break;
+ case 1:
+ this_mv.as_mv.col += 2;
+
+ if (startmv.as_mv.row & 7)
+ {
+ this_mv.as_mv.row -= 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6;
+ thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 2:
+ this_mv.as_mv.row += 2;
+
+ if (startmv.as_mv.col & 7)
+ {
+ this_mv.as_mv.col -= 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+ else
+ {
+ this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6;
+ thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ }
+
+ break;
+ case 3:
+ this_mv.as_mv.col += 2;
+ this_mv.as_mv.row += 2;
+ thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse1)
+{
+ int bestmse = INT_MAX;
+ int_mv startmv;
+ int_mv this_mv;
+ unsigned char *z = (*(b->base_src) + b->src);
+ int left, right, up, down, diag;
+ unsigned int sse;
+    int whichdir;
+ int thismse;
+ int y_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+#if ARCH_X86 || ARCH_X86_64
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ unsigned char *y;
+
+ y_stride = 32;
+ /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
+ vfp->copymem(y0 - 1 - pre_stride, pre_stride, xd->y_buf, y_stride, 18);
+ y = xd->y_buf + y_stride + 1;
+#else
+ unsigned char *y = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
+ y_stride = pre_stride;
+#endif
+
+ /* central mv */
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ startmv = *bestmv;
+
+ /* calculate central point error */
+ bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
+ *distortion = bestmse;
+ bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+ /* go left then right and check error */
+ this_mv.as_mv.row = startmv.as_mv.row;
+ this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
+ thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
+ left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (left < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = left;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.col += 8;
+ thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse);
+ right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (right < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = right;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* go up then down and check error */
+ this_mv.as_mv.col = startmv.as_mv.col;
+ this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
+ thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
+ up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (up < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = up;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ this_mv.as_mv.row += 8;
+ thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse);
+ down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (down < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = down;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ /* now check 1 more diagonal - */
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+ this_mv = startmv;
+
+ switch (whichdir)
+ {
+ case 0:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 1:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4;
+ thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse);
+ break;
+ case 2:
+ this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse);
+ break;
+ case 3:
+ default:
+ this_mv.as_mv.col += 4;
+ this_mv.as_mv.row += 4;
+ thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse);
+ break;
+ }
+
+ diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+ if (diag < bestmse)
+ {
+ *bestmv = this_mv;
+ bestmse = diag;
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+
+ return bestmse;
+}
+
+#define CHECK_BOUNDS(range) \
+{\
+ all_in = 1;\
+ all_in &= ((br-range) >= x->mv_row_min);\
+ all_in &= ((br+range) <= x->mv_row_max);\
+ all_in &= ((bc-range) >= x->mv_col_min);\
+ all_in &= ((bc+range) <= x->mv_col_max);\
+}
+
+#define CHECK_POINT \
+{\
+ if (this_mv.as_mv.col < x->mv_col_min) continue;\
+ if (this_mv.as_mv.col > x->mv_col_max) continue;\
+ if (this_mv.as_mv.row < x->mv_row_min) continue;\
+ if (this_mv.as_mv.row > x->mv_row_max) continue;\
+}
+
+#define CHECK_BETTER \
+{\
+ if (thissad < bestsad)\
+ {\
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);\
+ if (thissad < bestsad)\
+ {\
+ bestsad = thissad;\
+ best_site = i;\
+ }\
+ }\
+}
+
+static const MV next_chkpts[6][3] =
+{
+ {{ -2, 0}, { -1, -2}, {1, -2}},
+ {{ -1, -2}, {1, -2}, {2, 0}},
+ {{1, -2}, {2, 0}, {1, 2}},
+ {{2, 0}, {1, 2}, { -1, 2}},
+ {{1, 2}, { -1, 2}, { -2, 0}},
+ {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
+
+int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvsadcost[2],
+ int *mvcost[2],
+ int_mv *center_mv
+)
+{
+    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
+    MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
+ int i, j;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+ int in_what_stride = pre_stride;
+ int br, bc;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ unsigned char *base_offset;
+ unsigned char *this_offset;
+ int k = -1;
+ int all_in;
+ int best_site = -1;
+ int hex_range = 127;
+ int dia_range = 8;
+
+ int_mv fcenter_mv;
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* adjust ref_mv to make sure it is within MV range */
+ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->as_mv.row;
+ bc = ref_mv->as_mv.col;
+
+ /* Work out the start point for the search */
+ base_offset = (unsigned char *)(base_pre + d->offset);
+ this_offset = base_offset + (br * (pre_stride)) + bc;
+ this_mv.as_mv.row = br;
+ this_mv.as_mv.col = bc;
+ bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Lower search range based on prediction info */
+ if (search_param >= 6) goto cal_neighbors;
+ else if (search_param >= 5) hex_range = 4;
+ else if (search_param >= 4) hex_range = 6;
+ else if (search_param >= 3) hex_range = 15;
+ else if (search_param >= 2) hex_range = 31;
+ else if (search_param >= 1) hex_range = 63;
+
+ dia_range = 8;
+#endif
+
+ /* hex search */
+ CHECK_BOUNDS(2)
+
+ if(all_in)
+ {
+ for (i = 0; i < 6; i++)
+ {
+ this_mv.as_mv.row = br + hex[i].row;
+ this_mv.as_mv.col = bc + hex[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+    }
+    else
+ {
+ for (i = 0; i < 6; i++)
+ {
+ this_mv.as_mv.row = br + hex[i].row;
+ this_mv.as_mv.col = bc + hex[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ goto cal_neighbors;
+ else
+ {
+ br += hex[best_site].row;
+ bc += hex[best_site].col;
+ k = best_site;
+ }
+
+ for (j = 1; j < hex_range; j++)
+ {
+ best_site = -1;
+ CHECK_BOUNDS(2)
+
+ if(all_in)
+ {
+ for (i = 0; i < 3; i++)
+ {
+ this_mv.as_mv.row = br + next_chkpts[k][i].row;
+ this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+    }
+    else
+ {
+ for (i = 0; i < 3; i++)
+ {
+ this_mv.as_mv.row = br + next_chkpts[k][i].row;
+ this_mv.as_mv.col = bc + next_chkpts[k][i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ br += next_chkpts[k][best_site].row;
+ bc += next_chkpts[k][best_site].col;
+ k += 5 + best_site;
+ if (k >= 12) k -= 12;
+ else if (k >= 6) k -= 6;
+ }
+ }
+
+ /* check 4 1-away neighbors */
+cal_neighbors:
+ for (j = 0; j < dia_range; j++)
+ {
+ best_site = -1;
+ CHECK_BOUNDS(1)
+
+ if(all_in)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+    }
+    else
+ {
+ for (i = 0; i < 4; i++)
+ {
+ this_mv.as_mv.row = br + neighbors[i].row;
+ this_mv.as_mv.col = bc + neighbors[i].col;
+ CHECK_POINT
+ this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ br += neighbors[best_site].row;
+ bc += neighbors[best_site].col;
+ }
+ }
+
+ best_mv->as_mv.row = br;
+ best_mv->as_mv.col = bc;
+
+ return bestsad;
+}
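+
+/* Worked example of the direction update above: k tracks the hex direction
+ * of the last move, and "k += 5 + best_site" followed by the two
+ * conditional subtractions is k = (k + best_site + 5) % 6, so each round
+ * only the three candidate points not already examined are checked via
+ * next_chkpts[k].
+ */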
+#undef CHECK_BOUNDS
+#undef CHECK_POINT
+#undef CHECK_BETTER
+
+int vp8_diamond_search_sad_c
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ int_mv this_mv;
+
+ unsigned int bestsad;
+ unsigned int thissad;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->as_mv.row;
+ ref_col = ref_mv->as_mv.col;
+ *num00 = 0;
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Work out the start point for the search */
+ in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
+ best_address = in_what;
+
+ /* Check the starting position */
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* search_param determines the length of the initial step and hence the
+     * number of iterations: 0 = initial step (MAX_FIRST_STEP) pel,
+     * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
+ {
+ /* Trap illegal vectors */
+ this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+
+ {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
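+
+/* Worked example of the step accounting above (assuming
+ * MAX_FIRST_STEP = 128, i.e. eight halvings): vp8_init_dsmotion_compensation
+ * then yields ss_count = 33 with searches_per_step = 4, so search_param = 2
+ * skips the two largest steps and tot_steps = 33 / 4 - 2 = 6 diamond
+ * contractions are run.
+ */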
+
+int vp8_diamond_search_sadx4
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+)
+{
+ int i, j, step;
+
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *best_address;
+
+ int tot_steps;
+ int_mv this_mv;
+
+ unsigned int bestsad;
+ unsigned int thissad;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+ int this_row_offset;
+ int this_col_offset;
+ search_site *ss;
+
+ unsigned char *check_here;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->as_mv.row;
+ ref_col = ref_mv->as_mv.col;
+ *num00 = 0;
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Work out the start point for the search */
+ in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
+ best_address = in_what;
+
+ /* Check the starting position */
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* search_param determines the length of the initial step and hence the
+     * number of iterations: 0 = initial step (MAX_FIRST_STEP) pel,
+     * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
+ ss = &x->ss[search_param * x->searches_per_step];
+ tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+ i = 1;
+
+ for (step = 0; step < tot_steps ; step++)
+ {
+ int all_in = 1, t;
+
+        /* To know whether all neighbor points are within the bounds, 4
+         * bounds checks are enough instead of checking the 4 bounds for
+         * each point.
+         */
+ all_in &= ((best_mv->as_mv.row + ss[i].mv.row)> x->mv_row_min);
+ all_in &= ((best_mv->as_mv.row + ss[i+1].mv.row) < x->mv_row_max);
+ all_in &= ((best_mv->as_mv.col + ss[i+2].mv.col) > x->mv_col_min);
+ all_in &= ((best_mv->as_mv.col + ss[i+3].mv.col) < x->mv_col_max);
+
+ if (all_in)
+ {
+ unsigned int sad_array[4];
+
+ for (j = 0 ; j < x->searches_per_step ; j += 4)
+ {
+ const unsigned char *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i+t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (t = 0; t < 4; t++, i++)
+ {
+ if (sad_array[t] < bestsad)
+ {
+ this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
+ this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
+ sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (sad_array[t] < bestsad)
+ {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0 ; j < x->searches_per_step ; j++)
+ {
+ /* Trap illegal vectors */
+ this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
+ this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+ {
+ check_here = ss[i].offset + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+
+ if (best_site != last_site)
+ {
+ best_mv->as_mv.row += ss[best_site].mv.row;
+ best_mv->as_mv.col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+ }
+ else if (best_address == in_what)
+ (*num00)++;
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ int mv_stride = pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.mv;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ int r, c;
+
+ unsigned char *check_here;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Work out the mid point for the search */
+ in_what = base_pre + d->offset;
+ bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Baseline value at the centre */
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us from using vectors that stretch
+     * beyond the UMV border
+     */
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
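+    /* What remains is an exhaustive raster scan over the clamped window:
+     * (row_max - row_min) * (col_max - col_min) candidates, i.e. roughly
+     * (2 * distance)^2 SAD evaluations when no clamping occurs.
+     */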
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+
+ for (c = col_min; c < col_max; c++)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+
+ check_here++;
+ }
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ unsigned char *in_what;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ int mv_stride = pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.mv;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ int r, c;
+
+ unsigned char *check_here;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ unsigned int sad_array[3];
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Work out the mid point for the search */
+ in_what = base_pre + d->offset;
+ bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Baseline value at the centre */
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us from using vectors that stretch
+     * beyond the UMV border
+     */
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 2) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+            check_here++;
+            c++;
+ }
+
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int sad_per_bit, int distance,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ unsigned char *in_what;
+ int in_what_stride = pre_stride;
+ int mv_stride = pre_stride;
+ unsigned char *bestaddress;
+ int_mv *best_mv = &d->bmi.mv;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+ int r, c;
+
+ unsigned char *check_here;
+
+ int ref_row = ref_mv->as_mv.row;
+ int ref_col = ref_mv->as_mv.col;
+
+ int row_min = ref_row - distance;
+ int row_max = ref_row + distance;
+ int col_min = ref_col - distance;
+ int col_max = ref_col + distance;
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
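+    /* sad_array8 is 16-byte aligned, presumably so that SIMD
+     * implementations of sdx8f can store all eight 16-bit SADs with a
+     * single aligned write.
+     */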
+ unsigned int sad_array[3];
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Work out the mid point for the search */
+ in_what = base_pre + d->offset;
+ bestaddress = in_what + (ref_row * pre_stride) + ref_col;
+
+ best_mv->as_mv.row = ref_row;
+ best_mv->as_mv.col = ref_col;
+
+ /* Baseline value at the centre */
+ bestsad = fn_ptr->sdf(what, what_stride,
+ bestaddress, in_what_stride, UINT_MAX)
+ + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+
+    /* Apply further limits to prevent us from using vectors that stretch
+     * beyond the UMV border
+     */
+ if (col_min < x->mv_col_min)
+ col_min = x->mv_col_min;
+
+ if (col_max > x->mv_col_max)
+ col_max = x->mv_col_max;
+
+ if (row_min < x->mv_row_min)
+ row_min = x->mv_row_min;
+
+ if (row_max > x->mv_row_max)
+ row_max = x->mv_row_max;
+
+ for (r = row_min; r < row_max ; r++)
+ {
+ this_mv.as_mv.row = r;
+ check_here = r * mv_stride + in_what + col_min;
+ c = col_min;
+
+ while ((c + 7) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
+
+ for (i = 0; i < 8; i++)
+ {
+ thissad = sad_array8[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while ((c + 2) < col_max)
+ {
+ int i;
+
+ fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+ for (i = 0; i < 3; i++)
+ {
+ thissad = sad_array[i];
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+ check_here++;
+ c++;
+ }
+ }
+
+ while (c < col_max)
+ {
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_mv->as_mv.row = r;
+ best_mv->as_mv.col = c;
+ bestaddress = check_here;
+ }
+ }
+
+            check_here++;
+            c++;
+ }
+ }
+
+ this_mv.as_mv.row = best_mv->as_mv.row << 3;
+ this_mv.as_mv.col = best_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+ int error_per_bit, int search_range,
+ vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+ int_mv *center_mv)
+{
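+    /* Candidate offsets in (row, col) order: above, left, right, below */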
+ MV neighbors[4] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}};
+ int i, j;
+ short this_row_offset, this_col_offset;
+
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *what = (*(b->base_src) + b->src);
+ unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+ (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
+ unsigned char *check_here;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ for (i=0; i<search_range; i++)
+ {
+ int best_site = -1;
+
+ for (j = 0 ; j < 4 ; j++)
+ {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+ {
+ check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row)*in_what_stride + neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2], int_mv *center_mv)
+{
+ MV neighbors[4] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}};
+ int i, j;
+ short this_row_offset, this_col_offset;
+
+ int what_stride = b->src_stride;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int in_what_stride = pre_stride;
+ unsigned char *what = (*(b->base_src) + b->src);
+ unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
+ (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
+ unsigned char *check_here;
+ int_mv this_mv;
+ unsigned int bestsad;
+ unsigned int thissad;
+
+ int *mvsadcost[2];
+ int_mv fcenter_mv;
+
+ mvsadcost[0] = x->mvsadcost[0];
+ mvsadcost[1] = x->mvsadcost[1];
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ bestsad = fn_ptr->sdf(what, what_stride, best_address,
+ in_what_stride, UINT_MAX)
+ + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ for (i=0; i<search_range; i++)
+ {
+ int best_site = -1;
+ int all_in = 1;
+
+ all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);
+ all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);
+ all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);
+ all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+
+ if(all_in)
+ {
+ unsigned int sad_array[4];
+ const unsigned char *block_offset[4];
+ block_offset[0] = best_address - in_what_stride;
+ block_offset[1] = best_address - 1;
+ block_offset[2] = best_address + 1;
+ block_offset[3] = best_address + in_what_stride;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+ for (j = 0; j < 4; j++)
+ {
+ if (sad_array[j] < bestsad)
+ {
+ this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
+ this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
+ sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ if (sad_array[j] < bestsad)
+ {
+ bestsad = sad_array[j];
+ best_site = j;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0 ; j < 4 ; j++)
+ {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+ {
+ check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+ if (thissad < bestsad)
+ {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
+
+ if (thissad < bestsad)
+ {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1)
+ break;
+ else
+ {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row)*in_what_stride + neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+ + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+}
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void)
+{
+ FILE *f = fopen("modecont.c", "w");
+ int i, j;
+
+ fprintf(f, "#include \"entropy.h\"\n");
+ fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+ fprintf(f, "{\n");
+
+ for (j = 0; j < 6; j++)
+ {
+ fprintf(f, " { /* %d */\n", j);
+ fprintf(f, " ");
+
+ for (i = 0; i < 4; i++)
+ {
+ int overal_prob;
+ int this_prob;
+ int count;
+
+ /* Overall probs */
+ count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
+
+ if (count)
+ overal_prob = 256 * mv_mode_cts[i][0] / count;
+ else
+ overal_prob = 128;
+
+ if (overal_prob == 0)
+ overal_prob = 1;
+
+ /* context probs */
+ count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+ if (count)
+ this_prob = 256 * mv_ref_ct[j][i][0] / count;
+ else
+ this_prob = 128;
+
+ if (this_prob == 0)
+ this_prob = 1;
+
+ fprintf(f, "%5d, ", this_prob);
+ }
+
+ fprintf(f, " },\n");
+ }
+
+ fprintf(f, "};\n");
+ fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+#ifdef ENTROPY_STATS
+void init_mv_ref_counts()
+{
+ vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+ if (m == ZEROMV)
+ {
+ ++mv_ref_ct [ct[0]] [0] [0];
+ ++mv_mode_cts[0][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[0]] [0] [1];
+ ++mv_mode_cts[0][1];
+
+ if (m == NEARESTMV)
+ {
+ ++mv_ref_ct [ct[1]] [1] [0];
+ ++mv_mode_cts[1][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[1]] [1] [1];
+ ++mv_mode_cts[1][1];
+
+ if (m == NEARMV)
+ {
+ ++mv_ref_ct [ct[2]] [2] [0];
+ ++mv_mode_cts[2][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[2]] [2] [1];
+ ++mv_mode_cts[2][1];
+
+ if (m == NEWMV)
+ {
+ ++mv_ref_ct [ct[3]] [3] [0];
+ ++mv_mode_cts[3][0];
+ }
+ else
+ {
+ ++mv_ref_ct [ct[3]] [3] [1];
+ ++mv_mode_cts[3][1];
+ }
+ }
+ }
+ }
+}
+
+#endif /* END MV ref count ENTROPY_STATS stats code */
+
+#endif
diff --git a/libvpx/vp8/encoder/mcomp.h b/libvpx/vp8/encoder/mcomp.h
new file mode 100644
index 0000000..890113f
--- /dev/null
+++ b/libvpx/vp8/encoder/mcomp.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "vp8/common/variance.h"
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+/* The maximum number of steps in a step search given the largest allowed
+ * initial step
+ */
+#define MAX_MVSEARCH_STEPS 8
+
+/* Max full pel mv specified in 1 pel units */
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+
+/* Maximum size of the first step in full pel units */
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
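+/* With MAX_MVSEARCH_STEPS == 8 these evaluate to MAX_FULL_PEL_VAL == 255
+ * and MAX_FIRST_STEP == 128 (both in full-pel units).
+ */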
+
+extern void print_mode_context(void);
+extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
+extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride);
+
+
+extern int vp8_hex_search
+(
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vf,
+ int *mvsadcost[2],
+ int *mvcost[2],
+ int_mv *center_mv
+);
+
+typedef int (fractional_mv_step_fp)
+ (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2],
+ int *distortion, unsigned int *sse);
+
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
+extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
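+/* These are selected in vp8_set_speed_features(): the iterative search when
+ * sf->iterative_sub_pixel is set, otherwise the quarter-pel, half-pel, or
+ * skip variants depending on sf->quarter_pixel_search,
+ * sf->half_pixel_search and common.full_pixel.
+ */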
+
+typedef int (*vp8_full_search_fn_t)
+ (
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int sad_per_bit,
+ int distance,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+ );
+
+typedef int (*vp8_refining_search_fn_t)
+ (
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int sad_per_bit,
+ int distance,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+ );
+
+typedef int (*vp8_diamond_search_fn_t)
+ (
+ MACROBLOCK *x,
+ BLOCK *b,
+ BLOCKD *d,
+ int_mv *ref_mv,
+ int_mv *best_mv,
+ int search_param,
+ int sad_per_bit,
+ int *num00,
+ vp8_variance_fn_ptr_t *fn_ptr,
+ int *mvcost[2],
+ int_mv *center_mv
+ );
+
+#endif
diff --git a/libvpx/vp8/encoder/modecosts.c b/libvpx/vp8/encoder/modecosts.c
new file mode 100644
index 0000000..c61563c
--- /dev/null
+++ b/libvpx/vp8/encoder/modecosts.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "vp8/common/entropymode.h"
+
+
+void vp8_init_mode_costs(VP8_COMP *c)
+{
+ VP8_COMMON *x = &c->common;
+ struct rd_costs_struct *rd_costs = &c->rd_costs;
+
+ {
+ const vp8_tree_p T = vp8_bmode_tree;
+
+ int i = 0;
+
+ do
+ {
+ int j = 0;
+
+ do
+ {
+ vp8_cost_tokens(rd_costs->bmode_costs[i][j],
+ vp8_kf_bmode_prob[i][j], T);
+ }
+ while (++j < VP8_BINTRAMODES);
+ }
+ while (++i < VP8_BINTRAMODES);
+
+ vp8_cost_tokens(rd_costs->inter_bmode_costs, x->fc.bmode_prob, T);
+ }
+ vp8_cost_tokens(rd_costs->inter_bmode_costs, x->fc.sub_mv_ref_prob,
+ vp8_sub_mv_ref_tree);
+
+ vp8_cost_tokens(rd_costs->mbmode_cost[1], x->fc.ymode_prob, vp8_ymode_tree);
+ vp8_cost_tokens(rd_costs->mbmode_cost[0], vp8_kf_ymode_prob,
+ vp8_kf_ymode_tree);
+
+ vp8_cost_tokens(rd_costs->intra_uv_mode_cost[1], x->fc.uv_mode_prob,
+ vp8_uv_mode_tree);
+ vp8_cost_tokens(rd_costs->intra_uv_mode_cost[0], vp8_kf_uv_mode_prob,
+ vp8_uv_mode_tree);
+}
diff --git a/libvpx/vp8/encoder/modecosts.h b/libvpx/vp8/encoder/modecosts.h
new file mode 100644
index 0000000..99ef119
--- /dev/null
+++ b/libvpx/vp8/encoder/modecosts.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp8_init_mode_costs(VP8_COMP *x);
+
+#endif
diff --git a/libvpx/vp8/encoder/mr_dissim.c b/libvpx/vp8/encoder/mr_dissim.c
new file mode 100644
index 0000000..71218cc
--- /dev/null
+++ b/libvpx/vp8/encoder/mr_dissim.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "onyx_int.h"
+#include "mr_dissim.h"
+#include "vpx_mem/vpx_mem.h"
+#include "rdopt.h"
+
+void vp8_cal_low_res_mb_cols(VP8_COMP *cpi)
+{
+ int low_res_w;
+
+ /* Support arbitrary down-sampling factor */
+ unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den
+ + cpi->oxcf.mr_down_sampling_factor.num - 1;
+
+ low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num;
+ cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4);
+}
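+
+/* Example: with Width == 640 and a down-sampling factor of 2/1 (num = 2,
+ * den = 1): iw = 640*1 + 2 - 1 = 641, low_res_w = 641/2 = 320 (a ceiling
+ * division), and mr_low_res_mb_cols = (320 + 15) >> 4 = 20.
+ */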
+
+#define GET_MV(x) \
+if(x->mbmi.ref_frame !=INTRA_FRAME) \
+{ \
+ mvx[cnt] = x->mbmi.mv.as_mv.row; \
+ mvy[cnt] = x->mbmi.mv.as_mv.col; \
+ cnt++; \
+}
+
+#define GET_MV_SIGN(x) \
+if(x->mbmi.ref_frame !=INTRA_FRAME) \
+{ \
+ mvx[cnt] = x->mbmi.mv.as_mv.row; \
+ mvy[cnt] = x->mbmi.mv.as_mv.col; \
+ if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \
+ != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \
+ { \
+ mvx[cnt] *= -1; \
+ mvy[cnt] *= -1; \
+ } \
+ cnt++; \
+}
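+
+/* Note: both macros read and update the locals mvx[], mvy[] and cnt (and
+ * GET_MV_SIGN also references cm and tmp) from the enclosing scope in
+ * vp8_cal_dissimilarity() below.
+ */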
+
+void vp8_cal_dissimilarity(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int i;
+
+    /* Note: The first row & first column in mip are outside the frame and
+     * were initialized to all 0 (ref_frame, mode, mv...).
+     * Their ref_frame = 0 means they won't be counted in the following
+     * calculation.
+     */
+ if (cpi->oxcf.mr_total_resolutions >1
+ && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+ {
+ /* Store info for show/no-show frames for supporting alt_ref.
+ * If parent frame is alt_ref, child has one too.
+ */
+ LOWER_RES_FRAME_INFO* store_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+ store_info->frame_type = cm->frame_type;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ store_info->is_frame_dropped = 0;
+ for (i = 1; i < MAX_REF_FRAMES; i++)
+ store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i];
+ }
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of allocated MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mip + cm->mode_info_stride;
+ LOWER_RES_MB_INFO* store_mode_info = store_info->mb_info;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ tmp++;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+ {
+ int dissim = INT_MAX;
+
+ if(tmp->mbmi.ref_frame !=INTRA_FRAME)
+ {
+ int mvx[8];
+ int mvy[8];
+ int mmvx;
+ int mmvy;
+ int cnt=0;
+ const MODE_INFO *here = tmp;
+ const MODE_INFO *above = here - cm->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ const MODE_INFO *aboveright = NULL;
+ const MODE_INFO *right = NULL;
+ const MODE_INFO *belowleft = NULL;
+ const MODE_INFO *below = NULL;
+ const MODE_INFO *belowright = NULL;
+
+ /* If alternate reference frame is used, we have to
+ * check sign of MV. */
+ if(cpi->oxcf.play_alternate)
+ {
+ /* Gather mv of neighboring MBs */
+ GET_MV_SIGN(above)
+ GET_MV_SIGN(left)
+ GET_MV_SIGN(aboveleft)
+
+ if(mb_col < (cm->mb_cols-1))
+ {
+ right = here + 1;
+ aboveright = above + 1;
+ GET_MV_SIGN(right)
+ GET_MV_SIGN(aboveright)
+ }
+
+ if(mb_row < (cm->mb_rows-1))
+ {
+ below = here + cm->mode_info_stride;
+ belowleft = below - 1;
+ GET_MV_SIGN(below)
+ GET_MV_SIGN(belowleft)
+ }
+
+ if(mb_col < (cm->mb_cols-1)
+ && mb_row < (cm->mb_rows-1))
+ {
+ belowright = below + 1;
+ GET_MV_SIGN(belowright)
+ }
+                    }
+                    else
+ {
+                        /* No alt_ref: gather mv of neighboring MBs */
+ GET_MV(above)
+ GET_MV(left)
+ GET_MV(aboveleft)
+
+ if(mb_col < (cm->mb_cols-1))
+ {
+ right = here + 1;
+ aboveright = above + 1;
+ GET_MV(right)
+ GET_MV(aboveright)
+ }
+
+ if(mb_row < (cm->mb_rows-1))
+ {
+ below = here + cm->mode_info_stride;
+ belowleft = below - 1;
+ GET_MV(below)
+ GET_MV(belowleft)
+ }
+
+ if(mb_col < (cm->mb_cols-1)
+ && mb_row < (cm->mb_rows-1))
+ {
+ belowright = below + 1;
+ GET_MV(belowright)
+ }
+ }
+
+ if (cnt > 0)
+ {
+ int max_mvx = mvx[0];
+ int min_mvx = mvx[0];
+ int max_mvy = mvy[0];
+ int min_mvy = mvy[0];
+ int i;
+
+ if (cnt > 1)
+ {
+ for (i=1; i< cnt; i++)
+ {
+ if (mvx[i] > max_mvx) max_mvx = mvx[i];
+ else if (mvx[i] < min_mvx) min_mvx = mvx[i];
+ if (mvy[i] > max_mvy) max_mvy = mvy[i];
+ else if (mvy[i] < min_mvy) min_mvy = mvy[i];
+ }
+ }
+
+ mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row),
+ abs(max_mvx - here->mbmi.mv.as_mv.row));
+ mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col),
+ abs(max_mvy - here->mbmi.mv.as_mv.col));
+ dissim = MAX(mmvx, mmvy);
+ }
+ }
+
+ /* Store mode info for next resolution encoding */
+ store_mode_info->mode = tmp->mbmi.mode;
+ store_mode_info->ref_frame = tmp->mbmi.ref_frame;
+ store_mode_info->mv.as_int = tmp->mbmi.mv.as_int;
+ store_mode_info->dissim = dissim;
+ tmp++;
+ store_mode_info++;
+ }
+ }
+ }
+ }
+}
+
+/* This function is called only when this frame is dropped at the current
+   resolution level. */
+void vp8_store_drop_frame_info(VP8_COMP *cpi)
+{
+    /* If the frame is dropped in lower-resolution encoding, this information
+       is passed to the higher resolution level so that the encoder knows
+       there is no mode & motion info available.
+     */
+ if (cpi->oxcf.mr_total_resolutions >1
+ && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+ {
+ /* Store info for show/no-show frames for supporting alt_ref.
+ * If parent frame is alt_ref, child has one too.
+ */
+ LOWER_RES_FRAME_INFO* store_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+        /* Set frame_type to be INTER_FRAME since we won't drop a key frame. */
+ store_info->frame_type = INTER_FRAME;
+ store_info->is_frame_dropped = 1;
+ }
+}
diff --git a/libvpx/vp8/encoder/mr_dissim.h b/libvpx/vp8/encoder/mr_dissim.h
new file mode 100644
index 0000000..f8cb135
--- /dev/null
+++ b/libvpx/vp8/encoder/mr_dissim.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MR_DISSIM_H
+#define __INC_MR_DISSIM_H
+#include "vpx_config.h"
+
+extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
+extern void vp8_cal_dissimilarity(VP8_COMP *cpi);
+extern void vp8_store_drop_frame_info(VP8_COMP *cpi);
+
+#endif
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
new file mode 100644
index 0000000..c7d81b1
--- /dev/null
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -0,0 +1,5568 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "vp8/common/systemdependent.h"
+#include "quantize.h"
+#include "vp8/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/extend.h"
+#include "ratectrl.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#if CONFIG_POSTPROC
+#include "vp8/common/postproc.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/vpx_timer.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+#if CONFIG_MULTI_RES_ENCODING
+#include "mr_dissim.h"
+#endif
+#include "encodeframe.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+extern int vp8_update_coef_context(VP8_COMP *cpi);
+extern void vp8_update_coef_probs(VP8_COMP *cpi);
+#endif
+
+extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
+extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+
+extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
+extern void print_parms(VP8_CONFIG *ocf, char *filenam);
+extern unsigned int vp8_get_processor_freq();
+extern void print_tree_update_probs();
+extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
+extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
+
+static void set_default_lf_deltas(VP8_COMP *cpi);
+
+extern const int vp8_gf_interval_table[101];
+
+#if CONFIG_INTERNAL_STATS
+#include "math.h"
+
+extern double vp8_calc_ssim
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ int lumamask,
+ double *weight
+);
+
+
+extern double vp8_calc_ssimg
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v
+);
+
+
+#endif
+
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[10][10][10];
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int tot_pm = 0;
+unsigned int cnt_pm = 0;
+unsigned int tot_ef = 0;
+unsigned int cnt_ef = 0;
+#endif
+
+#ifdef MODE_STATS
+extern unsigned __int64 Sectionbits[50];
+extern int y_modes[5] ;
+extern int uv_modes[4] ;
+extern int b_modes[10] ;
+
+extern int inter_y_modes[10] ;
+extern int inter_uv_modes[4] ;
+extern unsigned int inter_b_modes[15];
+#endif
+
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+extern const int qrounding_factors[129];
+extern const int qzbin_factors[129];
+extern void vp8cx_init_quantizer(VP8_COMP *cpi);
+extern const int vp8cx_base_skip_false_prob[128];
+
+/* Tables relating active max Q to active min Q */
+static const unsigned char kf_low_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+ 3,3,3,3,3,3,4,4,4,5,5,5,5,5,6,6,
+ 6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+ 11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+ 16,16,17,17,18,18,18,18,19,20,20,21,21,22,23,23
+};
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,2,2,2,2,3,3,3,3,
+ 3,3,3,3,4,4,4,4,5,5,5,5,5,5,6,6,
+ 6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+ 11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+ 16,16,17,17,18,18,18,18,19,19,20,20,20,20,21,21,
+ 21,21,22,22,23,23,24,25,25,26,26,27,28,28,29,30
+};
+static const unsigned char gf_low_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+ 3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,
+ 7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,
+ 11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
+ 19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
+ 27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
+ 35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,
+ 43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
+};
+static const unsigned char gf_mid_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,10,11,11,11,12,12,12,12,13,13,13,14,
+ 14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,
+ 22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,
+ 30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,
+ 38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
+ 49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
+};
+static const unsigned char gf_high_motion_minq[QINDEX_RANGE] =
+{
+ 0,0,0,0,1,1,1,1,1,2,2,2,3,3,3,4,
+ 4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
+ 9,10,10,10,11,11,12,12,13,13,14,14,15,15,16,16,
+ 17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,
+ 25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,
+ 33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,
+ 41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
+ 55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80
+};
+static const unsigned char inter_minq[QINDEX_RANGE] =
+{
+ 0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+ 9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+ 20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+ 32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+ 44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+ 57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+ 71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+ 86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void save_layer_context(VP8_COMP *cpi)
+{
+ LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
+
+ /* Save layer dependent coding state */
+ lc->target_bandwidth = cpi->target_bandwidth;
+ lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
+ lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
+ lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
+ lc->starting_buffer_level_in_ms = cpi->oxcf.starting_buffer_level_in_ms;
+ lc->optimal_buffer_level_in_ms = cpi->oxcf.optimal_buffer_level_in_ms;
+ lc->maximum_buffer_size_in_ms = cpi->oxcf.maximum_buffer_size_in_ms;
+ lc->buffer_level = cpi->buffer_level;
+ lc->bits_off_target = cpi->bits_off_target;
+ lc->total_actual_bits = cpi->total_actual_bits;
+ lc->worst_quality = cpi->worst_quality;
+ lc->active_worst_quality = cpi->active_worst_quality;
+ lc->best_quality = cpi->best_quality;
+ lc->active_best_quality = cpi->active_best_quality;
+ lc->ni_av_qi = cpi->ni_av_qi;
+ lc->ni_tot_qi = cpi->ni_tot_qi;
+ lc->ni_frames = cpi->ni_frames;
+ lc->avg_frame_qindex = cpi->avg_frame_qindex;
+ lc->rate_correction_factor = cpi->rate_correction_factor;
+ lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ lc->gf_rate_correction_factor = cpi->gf_rate_correction_factor;
+ lc->zbin_over_quant = cpi->zbin_over_quant;
+ lc->inter_frame_target = cpi->inter_frame_target;
+ lc->total_byte_count = cpi->total_byte_count;
+ lc->filter_level = cpi->common.filter_level;
+
+ lc->last_frame_percent_intra = cpi->last_frame_percent_intra;
+
+ memcpy (lc->count_mb_ref_frame_usage,
+ cpi->count_mb_ref_frame_usage,
+ sizeof(cpi->count_mb_ref_frame_usage));
+}
+
+static void restore_layer_context(VP8_COMP *cpi, const int layer)
+{
+ LAYER_CONTEXT *lc = &cpi->layer_context[layer];
+
+ /* Restore layer dependent coding state */
+ cpi->current_layer = layer;
+ cpi->target_bandwidth = lc->target_bandwidth;
+ cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+ cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
+ cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
+ cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
+ cpi->oxcf.starting_buffer_level_in_ms = lc->starting_buffer_level_in_ms;
+ cpi->oxcf.optimal_buffer_level_in_ms = lc->optimal_buffer_level_in_ms;
+ cpi->oxcf.maximum_buffer_size_in_ms = lc->maximum_buffer_size_in_ms;
+ cpi->buffer_level = lc->buffer_level;
+ cpi->bits_off_target = lc->bits_off_target;
+ cpi->total_actual_bits = lc->total_actual_bits;
+ cpi->active_worst_quality = lc->active_worst_quality;
+ cpi->active_best_quality = lc->active_best_quality;
+ cpi->ni_av_qi = lc->ni_av_qi;
+ cpi->ni_tot_qi = lc->ni_tot_qi;
+ cpi->ni_frames = lc->ni_frames;
+ cpi->avg_frame_qindex = lc->avg_frame_qindex;
+ cpi->rate_correction_factor = lc->rate_correction_factor;
+ cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor;
+ cpi->gf_rate_correction_factor = lc->gf_rate_correction_factor;
+ cpi->zbin_over_quant = lc->zbin_over_quant;
+ cpi->inter_frame_target = lc->inter_frame_target;
+ cpi->total_byte_count = lc->total_byte_count;
+ cpi->common.filter_level = lc->filter_level;
+
+ cpi->last_frame_percent_intra = lc->last_frame_percent_intra;
+
+ memcpy (cpi->count_mb_ref_frame_usage,
+ lc->count_mb_ref_frame_usage,
+ sizeof(cpi->count_mb_ref_frame_usage));
+}
+
+static void setup_features(VP8_COMP *cpi)
+{
+    // If segmentation is enabled, set the update flags
+ if ( cpi->mb.e_mbd.segmentation_enabled )
+ {
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ }
+ else
+ {
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ }
+
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ set_default_lf_deltas(cpi);
+
+}
+
+
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
+
+
+static void dealloc_compressor_data(VP8_COMP *cpi)
+{
+ vpx_free(cpi->tplist);
+ cpi->tplist = NULL;
+
+ /* Delete last frame MV storage buffers */
+ vpx_free(cpi->lfmv);
+ cpi->lfmv = 0;
+
+ vpx_free(cpi->lf_ref_frame_sign_bias);
+ cpi->lf_ref_frame_sign_bias = 0;
+
+ vpx_free(cpi->lf_ref_frame);
+ cpi->lf_ref_frame = 0;
+
+    /* Delete segmentation map */
+ vpx_free(cpi->segmentation_map);
+ cpi->segmentation_map = 0;
+
+ vpx_free(cpi->active_map);
+ cpi->active_map = 0;
+
+ vp8_de_alloc_frame_buffers(&cpi->common);
+
+ vp8_yv12_de_alloc_frame_buffer(&cpi->pick_lf_lvl_frame);
+ vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+ dealloc_raw_frame_buffers(cpi);
+
+ vpx_free(cpi->tok);
+ cpi->tok = 0;
+
+ /* Structure used to monitor GF usage */
+ vpx_free(cpi->gf_active_flags);
+ cpi->gf_active_flags = 0;
+
+ /* Activity mask based per mb zbin adjustments */
+ vpx_free(cpi->mb_activity_map);
+ cpi->mb_activity_map = 0;
+
+ vpx_free(cpi->mb.pip);
+ cpi->mb.pip = 0;
+
+#if CONFIG_MULTITHREAD
+ vpx_free(cpi->mt_current_mb_col);
+ cpi->mt_current_mb_col = NULL;
+#endif
+}
+
+static void enable_segmentation(VP8_COMP *cpi)
+{
+ /* Set the appropriate feature bit */
+ cpi->mb.e_mbd.segmentation_enabled = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+static void disable_segmentation(VP8_COMP *cpi)
+{
+ /* Clear the appropriate feature bit */
+ cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+/* Valid values for a segment are 0 to 3.
+ * The segmentation map is arranged as [Rows][Columns].
+ */
+static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
+{
+ /* Copy in the new segmentation map */
+ vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+
+ /* Signal that the map should be updated. */
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+/* The values given for each segment can be either deltas (from the default
+ * value chosen for the frame) or absolute values.
+ *
+ * Valid range for abs values is:
+ * (0-127 for MB_LVL_ALT_Q), (0-63 for SEGMENT_ALT_LF)
+ * Valid range for delta values is:
+ * (+/-127 for MB_LVL_ALT_Q), (+/-63 for SEGMENT_ALT_LF)
+ *
+ * abs_delta = SEGMENT_DELTADATA (deltas)
+ * abs_delta = SEGMENT_ABSDATA (use the absolute values given).
+ *
+ */
+static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
+{
+ cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+ vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+}
+
+
+static void segmentation_test_function(VP8_COMP *cpi)
+{
+ unsigned char *seg_map;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+
+ // Create a temporary map for segmentation data.
+ CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
+ // Set the segmentation Map
+ set_segmentation_map(cpi, seg_map);
+
+ // Activate segmentation.
+ enable_segmentation(cpi);
+
+ // Set up the quant segment data
+ feature_data[MB_LVL_ALT_Q][0] = 0;
+ feature_data[MB_LVL_ALT_Q][1] = 4;
+ feature_data[MB_LVL_ALT_Q][2] = 0;
+ feature_data[MB_LVL_ALT_Q][3] = 0;
+ // Set up the loop segment data
+ feature_data[MB_LVL_ALT_LF][0] = 0;
+ feature_data[MB_LVL_ALT_LF][1] = 0;
+ feature_data[MB_LVL_ALT_LF][2] = 0;
+ feature_data[MB_LVL_ALT_LF][3] = 0;
+
+ // Initialise the feature data structure
+ // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+    // Delete segmentation map
+ vpx_free(seg_map);
+
+ seg_map = 0;
+}
+
+/* A simple function to cyclically refresh the background at a lower Q */
+static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
+{
+ unsigned char *seg_map = cpi->segmentation_map;
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int i;
+ int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+ int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
+
+ cpi->cyclic_refresh_q = Q / 2;
+
+ // Set every macroblock to be eligible for update.
+ // For key frame this will reset seg map to 0.
+ vpx_memset(cpi->segmentation_map, 0, mbs_in_frame);
+
+ if (cpi->common.frame_type != KEY_FRAME)
+ {
+ /* Cycle through the macro_block rows */
+ /* MB loop to set local segmentation map */
+ i = cpi->cyclic_refresh_mode_index;
+ assert(i < mbs_in_frame);
+ do
+ {
+            /* If the MB is a candidate for clean-up then mark it for
+             * possible boost/refresh (segment 1). The segment id may get
+             * reset to 0 later if the MB gets coded as anything other than
+             * last frame 0,0, as only (last frame 0,0) MBs are eligible for
+             * refresh: that is to say, MBs likely to be background blocks.
+             */
+ if (cpi->cyclic_refresh_map[i] == 0)
+ {
+ seg_map[i] = 1;
+ block_count --;
+ }
+ else if (cpi->cyclic_refresh_map[i] < 0)
+ cpi->cyclic_refresh_map[i]++;
+
+ i++;
+ if (i == mbs_in_frame)
+ i = 0;
+
+ }
+ while(block_count && i != cpi->cyclic_refresh_mode_index);
+
+ cpi->cyclic_refresh_mode_index = i;
+ }
+
+ /* Activate segmentation. */
+ cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+ enable_segmentation(cpi);
+
+ /* Set up the quant segment data */
+ feature_data[MB_LVL_ALT_Q][0] = 0;
+ feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q);
+ feature_data[MB_LVL_ALT_Q][2] = 0;
+ feature_data[MB_LVL_ALT_Q][3] = 0;
+
+ /* Set up the loop segment data */
+ feature_data[MB_LVL_ALT_LF][0] = 0;
+ feature_data[MB_LVL_ALT_LF][1] = lf_adjustment;
+ feature_data[MB_LVL_ALT_LF][2] = 0;
+ feature_data[MB_LVL_ALT_LF][3] = 0;
+
+ /* Initialise the feature data structure */
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+}
+
+static void set_default_lf_deltas(VP8_COMP *cpi)
+{
+ cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+ vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+ /* Test of ref frame deltas */
+ cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+ cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+ cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+ cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+ cpi->mb.e_mbd.mode_lf_deltas[0] = 4; /* BPRED */
+
+ if(cpi->oxcf.Mode == MODE_REALTIME)
+ cpi->mb.e_mbd.mode_lf_deltas[1] = -12; /* Zero */
+ else
+ cpi->mb.e_mbd.mode_lf_deltas[1] = -2; /* Zero */
+
+ cpi->mb.e_mbd.mode_lf_deltas[2] = 2; /* New mv */
+ cpi->mb.e_mbd.mode_lf_deltas[3] = 4; /* Split mv */
+}
+
+/* Convenience macros for mapping speed and mode into a continuous
+ * range
+ */
+#define GOOD(x) (x+1)
+#define RT(x) (x+7)
+
+static int speed_map(int speed, const int *map)
+{
+ int res;
+
+ do
+ {
+ res = *map++;
+ } while(speed >= *map++);
+ return res;
+}
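+
+/* The maps below are {value, threshold, value, threshold, ...} lists
+ * terminated by INT_MAX: speed_map() returns the first value whose
+ * following threshold exceeds the requested speed. For example, with
+ * thresh_mult_map_znn, speeds below GOOD(2) yield 0, speeds in
+ * [GOOD(2), GOOD(3)) yield 1500, and so on.
+ */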
+
+static const int thresh_mult_map_znn[] = {
+ /* map common to zero, nearest, and near */
+ 0, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(2), 2000, INT_MAX
+};
+
+static const int thresh_mult_map_vhpred[] = {
+ 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 1000, RT(1), 2000,
+ RT(7), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_bpred[] = {
+ 2000, GOOD(0), 2500, GOOD(2), 5000, GOOD(3), 7500, RT(0), 2500, RT(1), 5000,
+ RT(6), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_tm[] = {
+ 1000, GOOD(2), 1500, GOOD(3), 2000, RT(0), 0, RT(1), 1000, RT(2), 2000,
+ RT(7), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_new1[] = {
+ 1000, GOOD(2), 2000, RT(0), 2000, INT_MAX
+};
+
+static const int thresh_mult_map_new2[] = {
+ 1000, GOOD(2), 2000, GOOD(3), 2500, GOOD(5), 4000, RT(0), 2000, RT(2), 2500,
+ RT(5), 4000, INT_MAX
+};
+
+static const int thresh_mult_map_split1[] = {
+ 2500, GOOD(0), 1700, GOOD(2), 10000, GOOD(3), 25000, GOOD(4), INT_MAX,
+ RT(0), 5000, RT(1), 10000, RT(2), 25000, RT(3), INT_MAX, INT_MAX
+};
+
+static const int thresh_mult_map_split2[] = {
+ 5000, GOOD(0), 4500, GOOD(2), 20000, GOOD(3), 50000, GOOD(4), INT_MAX,
+ RT(0), 10000, RT(1), 20000, RT(2), 50000, RT(3), INT_MAX, INT_MAX
+};
+
+static const int mode_check_freq_map_zn2[] = {
+ /* {zero,nearest}{2,3} */
+ 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static const int mode_check_freq_map_vhbpred[] = {
+ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX
+};
+
+static const int mode_check_freq_map_near2[] = {
+ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(10), 1<<2, RT(11), 1<<3, RT(12), 1<<4,
+ INT_MAX
+};
+
+static const int mode_check_freq_map_new1[] = {
+ 0, RT(10), 1<<1, RT(11), 1<<2, RT(12), 1<<3, INT_MAX
+};
+
+static const int mode_check_freq_map_new2[] = {
+ 0, GOOD(5), 4, RT(0), 0, RT(3), 4, RT(10), 1<<3, RT(11), 1<<4, RT(12), 1<<5,
+ INT_MAX
+};
+
+static const int mode_check_freq_map_split1[] = {
+ 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX
+};
+
+static const int mode_check_freq_map_split2[] = {
+ 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX
+};
+
+void vp8_set_speed_features(VP8_COMP *cpi)
+{
+ SPEED_FEATURES *sf = &cpi->sf;
+ int Mode = cpi->compressor_speed;
+ int Speed = cpi->Speed;
+ int i;
+ VP8_COMMON *cm = &cpi->common;
+ int last_improved_quant = sf->improved_quant;
+ int ref_frames;
+
+ /* Initialise default mode frequency sampling variables */
+ for (i = 0; i < MAX_MODES; i ++)
+ {
+ cpi->mode_check_freq[i] = 0;
+ cpi->mode_test_hit_counts[i] = 0;
+ cpi->mode_chosen_counts[i] = 0;
+ }
+
+ cpi->mbs_tested_so_far = 0;
+
+ /* best quality defaults */
+ sf->RD = 1;
+ sf->search_method = NSTEP;
+ sf->improved_quant = 1;
+ sf->improved_dct = 1;
+ sf->auto_filter = 1;
+ sf->recode_loop = 1;
+ sf->quarter_pixel_search = 1;
+ sf->half_pixel_search = 1;
+ sf->iterative_sub_pixel = 1;
+ sf->optimize_coefficients = 1;
+ sf->use_fastquant_for_pick = 0;
+ sf->no_skip_block4x4_search = 1;
+
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ sf->improved_mv_pred = 1;
+
+ /* default thresholds to 0 */
+ for (i = 0; i < MAX_MODES; i++)
+ sf->thresh_mult[i] = 0;
+
+ /* Count enabled references */
+ ref_frames = 1;
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ ref_frames++;
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ ref_frames++;
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ ref_frames++;
+
+ /* Convert speed to continuous range, with clamping */
+ if (Mode == 0)
+ Speed = 0;
+ else if (Mode == 2)
+ Speed = RT(Speed);
+ else
+ {
+ if (Speed > 5)
+ Speed = 5;
+ Speed = GOOD(Speed);
+ }
+
+ sf->thresh_mult[THR_ZERO1] =
+ sf->thresh_mult[THR_NEAREST1] =
+ sf->thresh_mult[THR_NEAR1] =
+ sf->thresh_mult[THR_DC] = 0; /* always */
+
+ sf->thresh_mult[THR_ZERO2] =
+ sf->thresh_mult[THR_ZERO3] =
+ sf->thresh_mult[THR_NEAREST2] =
+ sf->thresh_mult[THR_NEAREST3] =
+ sf->thresh_mult[THR_NEAR2] =
+ sf->thresh_mult[THR_NEAR3] = speed_map(Speed, thresh_mult_map_znn);
+
+ sf->thresh_mult[THR_V_PRED] =
+ sf->thresh_mult[THR_H_PRED] = speed_map(Speed, thresh_mult_map_vhpred);
+ sf->thresh_mult[THR_B_PRED] = speed_map(Speed, thresh_mult_map_bpred);
+ sf->thresh_mult[THR_TM] = speed_map(Speed, thresh_mult_map_tm);
+ sf->thresh_mult[THR_NEW1] = speed_map(Speed, thresh_mult_map_new1);
+ sf->thresh_mult[THR_NEW2] =
+ sf->thresh_mult[THR_NEW3] = speed_map(Speed, thresh_mult_map_new2);
+ sf->thresh_mult[THR_SPLIT1] = speed_map(Speed, thresh_mult_map_split1);
+ sf->thresh_mult[THR_SPLIT2] =
+ sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
+
+ cpi->mode_check_freq[THR_ZERO1] =
+ cpi->mode_check_freq[THR_NEAREST1] =
+ cpi->mode_check_freq[THR_NEAR1] =
+ cpi->mode_check_freq[THR_TM] =
+ cpi->mode_check_freq[THR_DC] = 0; /* always */
+
+ cpi->mode_check_freq[THR_ZERO2] =
+ cpi->mode_check_freq[THR_ZERO3] =
+ cpi->mode_check_freq[THR_NEAREST2] =
+ cpi->mode_check_freq[THR_NEAREST3] = speed_map(Speed,
+ mode_check_freq_map_zn2);
+
+ cpi->mode_check_freq[THR_NEAR2] =
+ cpi->mode_check_freq[THR_NEAR3] = speed_map(Speed,
+ mode_check_freq_map_near2);
+
+ cpi->mode_check_freq[THR_V_PRED] =
+ cpi->mode_check_freq[THR_H_PRED] =
+ cpi->mode_check_freq[THR_B_PRED] = speed_map(Speed,
+ mode_check_freq_map_vhbpred);
+ cpi->mode_check_freq[THR_NEW1] = speed_map(Speed,
+ mode_check_freq_map_new1);
+ cpi->mode_check_freq[THR_NEW2] =
+ cpi->mode_check_freq[THR_NEW3] = speed_map(Speed,
+ mode_check_freq_map_new2);
+ cpi->mode_check_freq[THR_SPLIT1] = speed_map(Speed,
+ mode_check_freq_map_split1);
+ cpi->mode_check_freq[THR_SPLIT2] =
+ cpi->mode_check_freq[THR_SPLIT3] = speed_map(Speed,
+ mode_check_freq_map_split2);
+ Speed = cpi->Speed;
+ switch (Mode)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+ case 0: /* best quality mode */
+ sf->first_step = 0;
+ sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+ break;
+ case 1:
+ case 3:
+ if (Speed > 0)
+ {
+ /* Disable coefficient optimization above speed 0 */
+ sf->optimize_coefficients = 0;
+ sf->use_fastquant_for_pick = 1;
+ sf->no_skip_block4x4_search = 0;
+
+ sf->first_step = 1;
+ }
+
+ if (Speed > 2)
+ {
+ sf->improved_quant = 0;
+ sf->improved_dct = 0;
+
+ /* Only do recode loop on key frames, golden frames and
+ * alt ref frames
+ */
+ sf->recode_loop = 2;
+
+ }
+
+ if (Speed > 3)
+ {
+ sf->auto_filter = 1;
+ sf->recode_loop = 0; /* recode loop off */
+ sf->RD = 0; /* Turn rd off */
+
+ }
+
+ if (Speed > 4)
+ {
+ sf->auto_filter = 0; /* Faster selection of loop filter */
+ }
+
+ break;
+#endif
+ case 2:
+ sf->optimize_coefficients = 0;
+ sf->recode_loop = 0;
+ sf->auto_filter = 1;
+ sf->iterative_sub_pixel = 1;
+ sf->search_method = NSTEP;
+
+ if (Speed > 0)
+ {
+ sf->improved_quant = 0;
+ sf->improved_dct = 0;
+
+ sf->use_fastquant_for_pick = 1;
+ sf->no_skip_block4x4_search = 0;
+ sf->first_step = 1;
+ }
+
+ if (Speed > 2)
+ sf->auto_filter = 0; /* Faster selection of loop filter */
+
+ if (Speed > 3)
+ {
+ sf->RD = 0;
+ sf->auto_filter = 1;
+ }
+
+ if (Speed > 4)
+ {
+ sf->auto_filter = 0; /* Faster selection of loop filter */
+ sf->search_method = HEX;
+ sf->iterative_sub_pixel = 0;
+ }
+
+ if (Speed > 6)
+ {
+ unsigned int sum = 0;
+ unsigned int total_mbs = cm->MBs;
+ int i, thresh;
+ unsigned int total_skip;
+
+ int min = 2000;
+
+ if (cpi->oxcf.encode_breakout > 2000)
+ min = cpi->oxcf.encode_breakout;
+
+ min >>= 7;
+
+ for (i = 0; i < min; i++)
+ {
+ sum += cpi->error_bins[i];
+ }
+
+ total_skip = sum;
+ sum = 0;
+
+                /* i starts from 2 to make sure thresh starts from 2048 */
+ for (; i < 1024; i++)
+ {
+ sum += cpi->error_bins[i];
+
+ if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
+ break;
+ }
+
+ i--;
+ thresh = (i << 7);
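+            /* Each error bin apparently covers 128 units of error, hence
+             * the >> 7 when converting encode_breakout to a bin index
+             * above and the << 7 when converting the stopping index back
+             * to a threshold here.
+             */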
+
+ if (thresh < 2000)
+ thresh = 2000;
+
+ if (ref_frames > 1)
+ {
+ sf->thresh_mult[THR_NEW1 ] = thresh;
+ sf->thresh_mult[THR_NEAREST1 ] = thresh >> 1;
+ sf->thresh_mult[THR_NEAR1 ] = thresh >> 1;
+ }
+
+ if (ref_frames > 2)
+ {
+ sf->thresh_mult[THR_NEW2] = thresh << 1;
+ sf->thresh_mult[THR_NEAREST2 ] = thresh;
+ sf->thresh_mult[THR_NEAR2 ] = thresh;
+ }
+
+ if (ref_frames > 3)
+ {
+ sf->thresh_mult[THR_NEW3] = thresh << 1;
+ sf->thresh_mult[THR_NEAREST3 ] = thresh;
+ sf->thresh_mult[THR_NEAR3 ] = thresh;
+ }
+
+ sf->improved_mv_pred = 0;
+ }
+
+ if (Speed > 8)
+ sf->quarter_pixel_search = 0;
+
+ if(cm->version == 0)
+ {
+ cm->filter_type = NORMAL_LOOPFILTER;
+
+ if (Speed >= 14)
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ }
+ else
+ {
+ cm->filter_type = SIMPLE_LOOPFILTER;
+ }
+
+ /* This has a big hit on quality. Last resort */
+ if (Speed >= 15)
+ sf->half_pixel_search = 0;
+
+ vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
+
+    } /* switch */
+
+ /* Slow quant, dct and trellis not worthwhile for first pass
+ * so make sure they are always turned off.
+ */
+ if ( cpi->pass == 1 )
+ {
+ sf->improved_quant = 0;
+ sf->optimize_coefficients = 0;
+ sf->improved_dct = 0;
+ }
+
+ if (cpi->sf.search_method == NSTEP)
+ {
+ vp8_init3smotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
+ }
+ else if (cpi->sf.search_method == DIAMOND)
+ {
+ vp8_init_dsmotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
+ }
+
+ if (cpi->sf.improved_dct)
+ {
+ cpi->mb.short_fdct8x4 = vp8_short_fdct8x4;
+ cpi->mb.short_fdct4x4 = vp8_short_fdct4x4;
+ }
+ else
+ {
+ /* No fast FDCT defined for any platform at this time. */
+ cpi->mb.short_fdct8x4 = vp8_short_fdct8x4;
+ cpi->mb.short_fdct4x4 = vp8_short_fdct4x4;
+ }
+
+ cpi->mb.short_walsh4x4 = vp8_short_walsh4x4;
+
+ if (cpi->sf.improved_quant)
+ {
+ cpi->mb.quantize_b = vp8_regular_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
+ }
+ else
+ {
+ cpi->mb.quantize_b = vp8_fast_quantize_b;
+ cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
+ }
+ if (cpi->sf.improved_quant != last_improved_quant)
+ vp8cx_init_quantizer(cpi);
+
+ if (cpi->sf.iterative_sub_pixel == 1)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively;
+ }
+ else if (cpi->sf.quarter_pixel_search)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step;
+ }
+ else if (cpi->sf.half_pixel_search)
+ {
+ cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step;
+ }
+ else
+ {
+ cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+ }
+
+ if (cpi->sf.optimize_coefficients == 1 && cpi->pass!=1)
+ cpi->mb.optimize = 1;
+ else
+ cpi->mb.optimize = 0;
+
+ if (cpi->common.full_pixel)
+ cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+
+#ifdef SPEEDSTATS
+ frames_at_speed[cpi->Speed]++;
+#endif
+}
+#undef GOOD
+#undef RT
+
+static void alloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+#if VP8_TEMPORAL_ALT_REF
+ int width = (cpi->oxcf.Width + 15) & ~15;
+ int height = (cpi->oxcf.Height + 15) & ~15;
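+    /* (x + 15) & ~15 rounds up to the next multiple of 16 (one macroblock),
+     * e.g. 176x144 stays as-is while 100x100 becomes 112x112.
+     */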
+#endif
+
+ cpi->lookahead = vp8_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,
+ cpi->oxcf.lag_in_frames);
+ if(!cpi->lookahead)
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+
+#if VP8_TEMPORAL_ALT_REF
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+
+#endif
+}
+
+
+static void dealloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+#if VP8_TEMPORAL_ALT_REF
+ vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);
+#endif
+ vp8_lookahead_destroy(cpi->lookahead);
+}
+
+
+static int vp8_alloc_partition_data(VP8_COMP *cpi)
+{
+ vpx_free(cpi->mb.pip);
+
+ cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
+ (cpi->common.mb_rows + 1),
+ sizeof(PARTITION_INFO));
+ if(!cpi->mb.pip)
+ return 1;
+
+ cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
+
+ return 0;
+}
+
+void vp8_alloc_compressor_data(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = & cpi->common;
+
+ int width = cm->Width;
+ int height = cm->Height;
+
+ if (vp8_alloc_frame_buffers(cm, width, height))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ if (vp8_alloc_partition_data(cpi))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate partition data");
+
+
+ if ((width & 0xf) != 0)
+ width += 16 - (width & 0xf);
+
+ if ((height & 0xf) != 0)
+ height += 16 - (height & 0xf);
+
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->pick_lf_lvl_frame,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,
+ width, height, VP8BORDERINPIXELS))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+ vpx_free(cpi->tok);
+
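+    /* Worst case is one token per coefficient: 24 4x4 blocks per
+     * macroblock (16 luma + 4 U + 4 V) * 16 coefficients each = 384.
+     */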
+ {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ unsigned int tokens = 8 * 24 * 16; /* one MB for each thread */
+#else
+ unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+#endif
+ CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ }
+
+ /* Data used for real time vc mode to see if gf needs refreshing */
+ cpi->inter_zz_count = 0;
+ cpi->zeromv_count = 0;
+ cpi->gf_bad_count = 0;
+ cpi->gf_update_recommended = 0;
+
+
+ /* Structures used to monitor GF usage */
+ vpx_free(cpi->gf_active_flags);
+ CHECK_MEM_ERROR(cpi->gf_active_flags,
+ vpx_calloc(sizeof(*cpi->gf_active_flags),
+ cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+ vpx_free(cpi->mb_activity_map);
+ CHECK_MEM_ERROR(cpi->mb_activity_map,
+ vpx_calloc(sizeof(*cpi->mb_activity_map),
+ cm->mb_rows * cm->mb_cols));
+
+ /* allocate memory for storing last frame's MVs for MV prediction. */
+ vpx_free(cpi->lfmv);
+ CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+ sizeof(*cpi->lfmv)));
+ vpx_free(cpi->lf_ref_frame_sign_bias);
+ CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+ vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+ sizeof(*cpi->lf_ref_frame_sign_bias)));
+ vpx_free(cpi->lf_ref_frame);
+ CHECK_MEM_ERROR(cpi->lf_ref_frame,
+ vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+ sizeof(*cpi->lf_ref_frame)));
+
+ /* Create the encoder segmentation map and set all entries to 0 */
+ vpx_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(cpi->segmentation_map,
+ vpx_calloc(cm->mb_rows * cm->mb_cols,
+ sizeof(*cpi->segmentation_map)));
+ cpi->cyclic_refresh_mode_index = 0;
+ vpx_free(cpi->active_map);
+ CHECK_MEM_ERROR(cpi->active_map,
+ vpx_calloc(cm->mb_rows * cm->mb_cols,
+ sizeof(*cpi->active_map)));
+ vpx_memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+
+#if CONFIG_MULTITHREAD
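+    /* mt_sync_range is the synchronization granularity, in macroblock
+     * columns, between threads encoding adjacent rows; wider frames use a
+     * coarser granularity to reduce synchronization overhead.
+     */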
+ if (width < 640)
+ cpi->mt_sync_range = 1;
+ else if (width <= 1280)
+ cpi->mt_sync_range = 4;
+ else if (width <= 2560)
+ cpi->mt_sync_range = 8;
+ else
+ cpi->mt_sync_range = 16;
+
+ if (cpi->oxcf.multi_threaded > 1)
+ {
+ vpx_free(cpi->mt_current_mb_col);
+ CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+ vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+ }
+
+#endif
+
+ vpx_free(cpi->tplist);
+ CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+}
+
+
+/* Quant MOD */
+static const int q_trans[] =
+{
+ 0, 1, 2, 3, 4, 5, 7, 8,
+ 9, 10, 12, 13, 15, 17, 18, 19,
+ 20, 21, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 33, 35, 37, 39, 41,
+ 43, 45, 47, 49, 51, 53, 55, 57,
+ 59, 61, 64, 67, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 103,
+ 106, 109, 112, 115, 118, 121, 124, 127,
+};
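+
+/* q_trans[] maps the user-visible quantizer range 0..63 onto the codec's
+ * internal 0..127 qindex scale, with finer spacing at the low (high
+ * quality) end; vp8_reverse_trans() below performs the inverse lookup.
+ */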
+
+int vp8_reverse_trans(int x)
+{
+ int i;
+
+ for (i = 0; i < 64; i++)
+ if (q_trans[i] >= x)
+ return i;
+
+ return 63;
+}
+void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
+{
+ if(framerate < .1)
+ framerate = 30;
+
+ cpi->frame_rate = framerate;
+ cpi->output_frame_rate = framerate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+ cpi->output_frame_rate);
+ cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
+
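+    /* e.g. a 500000 bit/s target at 25 fps yields 20000 bits per frame;
+     * the minimum per-frame allocation is a configured percentage of that
+     * average.
+     */
+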
+ /* Set Maximum gf/arf interval */
+ cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
+
+ if(cpi->max_gf_interval < 12)
+ cpi->max_gf_interval = 12;
+
+ /* Extended interval for genuinely static scenes */
+ cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
+
+    /* Special conditions when the alt ref frame is enabled in lagged compress mode */
+ if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames)
+ {
+ if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+
+ if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+ }
+
+ if ( cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval )
+ cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+}
+
+
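+/* Computes val * num / denom using 64-bit intermediates so that, e.g.,
+ * converting a buffer level in milliseconds to bits via
+ * rescale(level_ms, target_bandwidth, 1000) cannot overflow 32 bits.
+ */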
+static int
+rescale(int val, int num, int denom)
+{
+ int64_t llnum = num;
+ int64_t llden = denom;
+ int64_t llval = val;
+
+ return (int)(llval * llnum / llden);
+}
+
+
+static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+
+ cpi->auto_gold = 1;
+ cpi->auto_adjust_gold_quantizer = 1;
+
+ cm->version = oxcf->Version;
+ vp8_setup_version(cm);
+
+ /* frame rate is not available on the first frame, as it's derived from
+ * the observed timestamps. The actual value used here doesn't matter
+ * too much, as it will adapt quickly. If the reciprocal of the timebase
+ * seems like a reasonable framerate, then use that as a guess, otherwise
+ * use 30.
+ */
+ cpi->frame_rate = (double)(oxcf->timebase.den) /
+ (double)(oxcf->timebase.num);
+
+ if (cpi->frame_rate > 180)
+ cpi->frame_rate = 30;
+
+ cpi->ref_frame_rate = cpi->frame_rate;
+
+    /* vp8_change_config() includes all the joint configuration functionality */
+ vp8_change_config(cpi, oxcf);
+
+ /* Initialize active best and worst q and average q values. */
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+
+ /* Initialise the starting buffer levels */
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+
+ cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
+ cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+
+ cpi->total_actual_bits = 0;
+ cpi->total_target_vs_actual = 0;
+
+    /* Temporal scalability */
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+ double prev_layer_frame_rate=0;
+
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+
+ /* Layer configuration */
+ lc->frame_rate =
+ cpi->output_frame_rate / cpi->oxcf.rate_decimator[i];
+ lc->target_bandwidth = cpi->oxcf.target_bitrate[i] * 1000;
+
+ lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level;
+ lc->optimal_buffer_level_in_ms = oxcf->optimal_buffer_level;
+ lc->maximum_buffer_size_in_ms = oxcf->maximum_buffer_size;
+
+ lc->starting_buffer_level =
+ rescale((int)(oxcf->starting_buffer_level),
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->optimal_buffer_level == 0)
+ lc->optimal_buffer_level = lc->target_bandwidth / 8;
+ else
+ lc->optimal_buffer_level =
+ rescale((int)(oxcf->optimal_buffer_level),
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->maximum_buffer_size == 0)
+ lc->maximum_buffer_size = lc->target_bandwidth / 8;
+ else
+ lc->maximum_buffer_size =
+ rescale((int)oxcf->maximum_buffer_size,
+ lc->target_bandwidth, 1000);
+
+ /* Work out the average size of a frame within this layer */
+ if (i > 0)
+ lc->avg_frame_size_for_layer =
+ (int)((cpi->oxcf.target_bitrate[i] -
+ cpi->oxcf.target_bitrate[i-1]) * 1000 /
+ (lc->frame_rate - prev_layer_frame_rate));
+
+ lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ lc->active_best_quality = cpi->oxcf.best_allowed_q;
+ lc->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+
+ lc->buffer_level = lc->starting_buffer_level;
+ lc->bits_off_target = lc->starting_buffer_level;
+
+ lc->total_actual_bits = 0;
+ lc->ni_av_qi = 0;
+ lc->ni_tot_qi = 0;
+ lc->ni_frames = 0;
+ lc->rate_correction_factor = 1.0;
+ lc->key_frame_rate_correction_factor = 1.0;
+ lc->gf_rate_correction_factor = 1.0;
+ lc->inter_frame_target = 0;
+
+ prev_layer_frame_rate = lc->frame_rate;
+ }
+ }
+
+#if VP8_TEMPORAL_ALT_REF
+ {
+ int i;
+
+ cpi->fixed_divide[0] = 0;
+
+ for (i = 1; i < 512; i++)
+ cpi->fixed_divide[i] = 0x80000 / i;
+ }
+#endif
+}
+
+static void update_layer_contexts (VP8_COMP *cpi)
+{
+ VP8_CONFIG *oxcf = &cpi->oxcf;
+
+ /* Update snapshots of the layer contexts to reflect new parameters */
+ if (oxcf->number_of_layers > 1)
+ {
+ unsigned int i;
+ double prev_layer_frame_rate=0;
+
+ for (i=0; i<oxcf->number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+
+ lc->frame_rate =
+ cpi->ref_frame_rate / oxcf->rate_decimator[i];
+ lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
+
+ lc->starting_buffer_level = rescale(
+ (int)oxcf->starting_buffer_level_in_ms,
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->optimal_buffer_level == 0)
+ lc->optimal_buffer_level = lc->target_bandwidth / 8;
+ else
+ lc->optimal_buffer_level = rescale(
+ (int)oxcf->optimal_buffer_level_in_ms,
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->maximum_buffer_size == 0)
+ lc->maximum_buffer_size = lc->target_bandwidth / 8;
+ else
+ lc->maximum_buffer_size = rescale(
+ (int)oxcf->maximum_buffer_size_in_ms,
+ lc->target_bandwidth, 1000);
+
+ /* Work out the average size of a frame within this layer */
+ if (i > 0)
+ lc->avg_frame_size_for_layer =
+ (int)((oxcf->target_bitrate[i] -
+ oxcf->target_bitrate[i-1]) * 1000 /
+ (lc->frame_rate - prev_layer_frame_rate));
+
+ prev_layer_frame_rate = lc->frame_rate;
+ }
+ }
+}
+
+void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int last_w, last_h;
+
+ if (!cpi)
+ return;
+
+ if (!oxcf)
+ return;
+
+#if CONFIG_MULTITHREAD
+    /* Wait for the loopfilter thread working on the last picture to finish */
+ if (cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+ if (cm->version != oxcf->Version)
+ {
+ cm->version = oxcf->Version;
+ vp8_setup_version(cm);
+ }
+
+ last_w = cpi->oxcf.Width;
+ last_h = cpi->oxcf.Height;
+
+ cpi->oxcf = *oxcf;
+
+ switch (cpi->oxcf.Mode)
+ {
+
+ case MODE_REALTIME:
+ cpi->pass = 0;
+ cpi->compressor_speed = 2;
+
+ if (cpi->oxcf.cpu_used < -16)
+ {
+ cpi->oxcf.cpu_used = -16;
+ }
+
+ if (cpi->oxcf.cpu_used > 16)
+ cpi->oxcf.cpu_used = 16;
+
+ break;
+
+ case MODE_GOODQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+
+ case MODE_BESTQUALITY:
+ cpi->pass = 0;
+ cpi->compressor_speed = 0;
+ break;
+
+ case MODE_FIRSTPASS:
+ cpi->pass = 1;
+ cpi->compressor_speed = 1;
+ break;
+ case MODE_SECONDPASS:
+ cpi->pass = 2;
+ cpi->compressor_speed = 1;
+
+ if (cpi->oxcf.cpu_used < -5)
+ {
+ cpi->oxcf.cpu_used = -5;
+ }
+
+ if (cpi->oxcf.cpu_used > 5)
+ cpi->oxcf.cpu_used = 5;
+
+ break;
+ case MODE_SECONDPASS_BEST:
+ cpi->pass = 2;
+ cpi->compressor_speed = 0;
+ break;
+ }
+
+ if (cpi->pass == 0)
+ cpi->auto_worst_q = 1;
+
+ cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+ cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+ cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
+
+ if (oxcf->fixed_q >= 0)
+ {
+ if (oxcf->worst_allowed_q < 0)
+ cpi->oxcf.fixed_q = q_trans[0];
+ else
+ cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+ if (oxcf->alt_q < 0)
+ cpi->oxcf.alt_q = q_trans[0];
+ else
+ cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+ if (oxcf->key_q < 0)
+ cpi->oxcf.key_q = q_trans[0];
+ else
+ cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+ if (oxcf->gold_q < 0)
+ cpi->oxcf.gold_q = q_trans[0];
+ else
+ cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+ }
+
+ cpi->baseline_gf_interval =
+ cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+
+ cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ cpi->oxcf.token_partitions = 3;
+#endif
+
+ if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+ cm->multi_token_partition =
+ (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+ setup_features(cpi);
+
+ {
+ int i;
+
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+ }
+
+ /* At the moment the first order values may not be > MAXQ */
+ if (cpi->oxcf.fixed_q > MAXQ)
+ cpi->oxcf.fixed_q = MAXQ;
+
+ /* local file playback mode == really big buffer */
+ if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+ {
+ cpi->oxcf.starting_buffer_level = 60000;
+ cpi->oxcf.optimal_buffer_level = 60000;
+ cpi->oxcf.maximum_buffer_size = 240000;
+ cpi->oxcf.starting_buffer_level_in_ms = 60000;
+ cpi->oxcf.optimal_buffer_level_in_ms = 60000;
+ cpi->oxcf.maximum_buffer_size_in_ms = 240000;
+ }
+
+ /* Convert target bandwidth from Kbit/s to Bit/s */
+ cpi->oxcf.target_bandwidth *= 1000;
+
+ cpi->oxcf.starting_buffer_level =
+ rescale((int)cpi->oxcf.starting_buffer_level,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ /* Set or reset optimal and maximum buffer levels. */
+ if (cpi->oxcf.optimal_buffer_level == 0)
+ cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.optimal_buffer_level =
+ rescale((int)cpi->oxcf.optimal_buffer_level,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ if (cpi->oxcf.maximum_buffer_size == 0)
+ cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+ else
+ cpi->oxcf.maximum_buffer_size =
+ rescale((int)cpi->oxcf.maximum_buffer_size,
+ cpi->oxcf.target_bandwidth, 1000);
+
+ /* Set up frame rate and related parameters rate control values. */
+ vp8_new_frame_rate(cpi, cpi->frame_rate);
+
+ /* Set absolute upper and lower quality limits */
+ cpi->worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->best_quality = cpi->oxcf.best_allowed_q;
+
+ /* active values should only be modified if out of new range */
+ if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ }
+ /* less likely */
+ else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+ }
+ if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ }
+ /* less likely */
+ else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+ {
+ cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+ }
+
+ cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
+
+ cpi->cq_target_quality = cpi->oxcf.cq_level;
+
+ /* Only allow dropped frames in buffered mode */
+ cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+ cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+
+ cm->Width = cpi->oxcf.Width;
+ cm->Height = cpi->oxcf.Height;
+
+ /* TODO(jkoleszar): if an internal spatial resampling is active,
+ * and we downsize the input image, maybe we should clear the
+ * internal scale immediately rather than waiting for it to
+ * correct.
+ */
+
+ /* VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) */
+ if (cpi->oxcf.Sharpness > 7)
+ cpi->oxcf.Sharpness = 7;
+
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ /* always go to the next whole number */
+ cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+ cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+ }
+
+ if (last_w != cpi->oxcf.Width || last_h != cpi->oxcf.Height)
+ cpi->force_next_frame_intra = 1;
+
+ if (((cm->Width + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_width ||
+ ((cm->Height + 15) & 0xfffffff0) !=
+ cm->yv12_fb[cm->lst_fb_idx].y_height ||
+ cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
+ {
+ dealloc_raw_frame_buffers(cpi);
+ alloc_raw_frame_buffers(cpi);
+ vp8_alloc_compressor_data(cpi);
+ }
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ cpi->last_q[0] = cpi->oxcf.fixed_q;
+ cpi->last_q[1] = cpi->oxcf.fixed_q;
+ }
+
+ cpi->Speed = cpi->oxcf.cpu_used;
+
+    /* Force allow_lag to 0 if lag_in_frames is 0 */
+ if (cpi->oxcf.lag_in_frames == 0)
+ {
+ cpi->oxcf.allow_lag = 0;
+ }
+ /* Limit on lag buffers as these are not currently dynamically allocated */
+ else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+ cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+ /* YX Temp */
+ cpi->alt_ref_source = NULL;
+ cpi->is_src_frame_alt_ref = 0;
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (!cpi->denoiser.yv12_mc_running_avg.buffer_alloc)
+ {
+ int width = (cpi->oxcf.Width + 15) & ~15;
+ int height = (cpi->oxcf.Height + 15) & ~15;
+ vp8_denoiser_allocate(&cpi->denoiser, width, height);
+ }
+ }
+#endif
+
+#if 0
+ /* Experimental RD Code */
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+}
+
+#define M_LOG2_E 0.693147180559945309417
+#define log2f(x) (log (x) / (float) M_LOG2_E)
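+/* Note: the constant above is actually ln(2) (M_LN2), so this log2f(x)
+ * expands to log(x) / ln(2) = log2(x); the macro name is a historical
+ * misnomer. cal_mvsadcosts() below fills symmetric SAD cost tables where
+ * the cost of a motion vector component of magnitude i grows roughly as
+ * 512 * (log2(8 * i) + 0.6), a coarse bit-cost model mirrored for
+ * negative offsets.
+ */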
+static void cal_mvsadcosts(int *mvsadcost[2])
+{
+ int i = 1;
+
+ mvsadcost [0] [0] = 300;
+ mvsadcost [1] [0] = 300;
+
+ do
+ {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost [0][i] = (int) z;
+ mvsadcost [1][i] = (int) z;
+ mvsadcost [0][-i] = (int) z;
+ mvsadcost [1][-i] = (int) z;
+ }
+ while (++i <= mvfp_max);
+}
+
+struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
+{
+ int i;
+
+ VP8_COMP *cpi;
+ VP8_COMMON *cm;
+
+ cpi = vpx_memalign(32, sizeof(VP8_COMP));
+ /* Check that the CPI instance is valid */
+ if (!cpi)
+ return 0;
+
+ cm = &cpi->common;
+
+ vpx_memset(cpi, 0, sizeof(VP8_COMP));
+
+ if (setjmp(cm->error.jmp))
+ {
+ cpi->common.error.setjmp = 0;
+ vp8_remove_compressor(&cpi);
+ return 0;
+ }
+
+ cpi->common.error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+ vp8_create_common(&cpi->common);
+
+ init_config(cpi, oxcf);
+
+ memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
+ cpi->common.current_video_frame = 0;
+ cpi->kf_overspend_bits = 0;
+ cpi->kf_bitrate_adjustment = 0;
+ cpi->frames_till_gf_update_due = 0;
+ cpi->gf_overspend_bits = 0;
+ cpi->non_gf_bitrate_adjustment = 0;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ cpi->prob_intra_coded = 63;
+
+ /* Prime the recent reference frame usage counters.
+ * Hereafter they will be maintained as a sort of moving average
+ */
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+ /* Set reference frame sign bias for ALTREF frame to 1 (for now) */
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+ cpi->twopass.gf_decay_rate = 0;
+ cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+    cpi->gold_is_last = 0;
+    cpi->alt_is_last = 0;
+    cpi->gold_is_alt = 0;
+
+ cpi->active_map_enabled = 0;
+
+#if 0
+ /* Experimental code for lagged and one pass */
+ /* Initialise one_pass GF frames stats */
+ /* Update stats used for GF selection */
+ if (cpi->pass == 0)
+ {
+ cpi->one_pass_frame_index = 0;
+
+ for (i = 0; i < MAX_LAG_BUFFERS; i++)
+ {
+ cpi->one_pass_frame_stats[i].frames_so_far = 0;
+ cpi->one_pass_frame_stats[i].frame_intra_error = 0.0;
+ cpi->one_pass_frame_stats[i].frame_coded_error = 0.0;
+ cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0;
+ cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvr = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvc = 0.0;
+ cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0;
+ }
+ }
+#endif
+
+    /* Should we use the cyclic refresh method?
+     * Currently this is tied to error resilient mode.
+     */
+ cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
+ cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5;
+ cpi->cyclic_refresh_mode_index = 0;
+ cpi->cyclic_refresh_q = 32;
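+    /* i.e. at most one fifth of the frame's macroblocks are refreshed per
+     * frame, cycling through the frame from cyclic_refresh_mode_index.
+     */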
+
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+ }
+ else
+ cpi->cyclic_refresh_map = (signed char *) NULL;
+
+#ifdef ENTROPY_STATS
+ init_context_counters();
+#endif
+
+    /* Initialize the feed-forward activity masking. */
+ cpi->activity_avg = 90<<12;
+
+ /* Give a sensible default for the first frame. */
+ cpi->frames_since_key = 8;
+ cpi->key_frame_frequency = cpi->oxcf.key_freq;
+ cpi->this_key_frame_forced = 0;
+ cpi->next_key_frame_forced = 0;
+
+ cpi->source_alt_ref_pending = 0;
+ cpi->source_alt_ref_active = 0;
+ cpi->common.refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_ssimg = 0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr)
+ {
+ cpi->total_sq_error = 0.0;
+ cpi->total_sq_error2 = 0.0;
+ cpi->total_y = 0.0;
+ cpi->total_u = 0.0;
+ cpi->total_v = 0.0;
+ cpi->total = 0.0;
+ cpi->totalp_y = 0.0;
+ cpi->totalp_u = 0.0;
+ cpi->totalp_v = 0.0;
+ cpi->totalp = 0.0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ cpi->total_ssimg_y = 0;
+ cpi->total_ssimg_u = 0;
+ cpi->total_ssimg_v = 0;
+ cpi->total_ssimg_all = 0;
+ }
+
+#endif
+
+ cpi->first_time_stamp_ever = 0x7FFFFFFF;
+
+ cpi->frames_till_gf_update_due = 0;
+ cpi->key_frame_count = 1;
+
+ cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
+ cpi->ni_tot_qi = 0;
+ cpi->ni_frames = 0;
+ cpi->total_byte_count = 0;
+
+ cpi->drop_frame = 0;
+
+ cpi->rate_correction_factor = 1.0;
+ cpi->key_frame_rate_correction_factor = 1.0;
+ cpi->gf_rate_correction_factor = 1.0;
+ cpi->twopass.est_max_qcorrection_factor = 1.0;
+
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+ {
+ cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+ }
+
+#ifdef OUTPUT_YUV_SRC
+ yuv_file = fopen("bd.yuv", "ab");
+#endif
+
+#if 0
+ framepsnr = fopen("framepsnr.stt", "a");
+ kf_list = fopen("kf_list.stt", "w");
+#endif
+
+ cpi->output_pkt_list = oxcf->output_pkt_list;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 1)
+ {
+ vp8_init_first_pass(cpi);
+ }
+ else if (cpi->pass == 2)
+ {
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = (void*)((char *)cpi->twopass.stats_in
+ + (packets - 1) * packet_sz);
+ vp8_init_second_pass(cpi);
+ }
+
+#endif
+
+ if (cpi->compressor_speed == 2)
+ {
+ cpi->avg_encode_time = 0;
+ cpi->avg_pick_mode_time = 0;
+ }
+
+ vp8_set_speed_features(cpi);
+
+ /* Set starting values of RD threshold multipliers (128 = *1) */
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+
+#ifdef ENTROPY_STATS
+ init_mv_ref_counts();
+#endif
+
+#if CONFIG_MULTITHREAD
+ if(vp8cx_create_encoder_threads(cpi))
+ {
+ vp8_remove_compressor(&cpi);
+ return 0;
+ }
+#endif
+
+ cpi->fn_ptr[BLOCK_16X16].sdf = vp8_sad16x16;
+ cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
+ cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
+ cpi->fn_ptr[BLOCK_16X16].sdx3f = vp8_sad16x16x3;
+ cpi->fn_ptr[BLOCK_16X16].sdx8f = vp8_sad16x16x8;
+ cpi->fn_ptr[BLOCK_16X16].sdx4df = vp8_sad16x16x4d;
+
+ cpi->fn_ptr[BLOCK_16X8].sdf = vp8_sad16x8;
+ cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_16X8].sdx3f = vp8_sad16x8x3;
+ cpi->fn_ptr[BLOCK_16X8].sdx8f = vp8_sad16x8x8;
+ cpi->fn_ptr[BLOCK_16X8].sdx4df = vp8_sad16x8x4d;
+
+ cpi->fn_ptr[BLOCK_8X16].sdf = vp8_sad8x16;
+ cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_8X16].sdx3f = vp8_sad8x16x3;
+ cpi->fn_ptr[BLOCK_8X16].sdx8f = vp8_sad8x16x8;
+ cpi->fn_ptr[BLOCK_8X16].sdx4df = vp8_sad8x16x4d;
+
+ cpi->fn_ptr[BLOCK_8X8].sdf = vp8_sad8x8;
+ cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_8X8].sdx3f = vp8_sad8x8x3;
+ cpi->fn_ptr[BLOCK_8X8].sdx8f = vp8_sad8x8x8;
+ cpi->fn_ptr[BLOCK_8X8].sdx4df = vp8_sad8x8x4d;
+
+ cpi->fn_ptr[BLOCK_4X4].sdf = vp8_sad4x4;
+ cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
+ cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
+ cpi->fn_ptr[BLOCK_4X4].sdx3f = vp8_sad4x4x3;
+ cpi->fn_ptr[BLOCK_4X4].sdx8f = vp8_sad4x4x8;
+ cpi->fn_ptr[BLOCK_4X4].sdx4df = vp8_sad4x4x4d;
+
+#if ARCH_X86 || ARCH_X86_64
+ cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_16X8].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_8X16].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_8X8].copymem = vp8_copy32xn;
+ cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn;
+#endif
+
+ cpi->full_search_sad = vp8_full_search_sad;
+ cpi->diamond_search_sad = vp8_diamond_search_sad;
+ cpi->refining_search_sad = vp8_refining_search_sad;
+
+ /* make sure frame 1 is okay */
+ cpi->error_bins[0] = cpi->common.MBs;
+
+ /* vp8cx_init_quantizer() is first called here. Add check in
+ * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * vp8cx_init_quantizer() for every frame.
+ */
+ vp8cx_init_quantizer(cpi);
+
+ vp8_loop_filter_init(cm);
+
+ cpi->common.error.setjmp = 0;
+
+#if CONFIG_MULTI_RES_ENCODING
+
+ /* Calculate # of MBs in a row in lower-resolution level image. */
+ if (cpi->oxcf.mr_encoder_id > 0)
+ vp8_cal_low_res_mb_cols(cpi);
+
+#endif
+
+ /* setup RD costs to MACROBLOCK struct */
+
+ cpi->mb.mvcost[0] = &cpi->rd_costs.mvcosts[0][mv_max+1];
+ cpi->mb.mvcost[1] = &cpi->rd_costs.mvcosts[1][mv_max+1];
+ cpi->mb.mvsadcost[0] = &cpi->rd_costs.mvsadcosts[0][mvfp_max+1];
+ cpi->mb.mvsadcost[1] = &cpi->rd_costs.mvsadcosts[1][mvfp_max+1];
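+    /* The tables are indexed by signed MV component values, so point at
+     * the centre element; both cost[c][-i] and cost[c][i] are then valid.
+     */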
+
+ cal_mvsadcosts(cpi->mb.mvsadcost);
+
+ cpi->mb.mbmode_cost = cpi->rd_costs.mbmode_cost;
+ cpi->mb.intra_uv_mode_cost = cpi->rd_costs.intra_uv_mode_cost;
+ cpi->mb.bmode_costs = cpi->rd_costs.bmode_costs;
+ cpi->mb.inter_bmode_costs = cpi->rd_costs.inter_bmode_costs;
+ cpi->mb.token_costs = cpi->rd_costs.token_costs;
+
+ /* setup block ptrs & offsets */
+ vp8_setup_block_ptrs(&cpi->mb);
+ vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+
+ return cpi;
+}
+
+
+void vp8_remove_compressor(VP8_COMP **ptr)
+{
+ VP8_COMP *cpi = *ptr;
+
+ if (!cpi)
+ return;
+
+    if (cpi->common.current_video_frame > 0)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ {
+ vp8_end_second_pass(cpi);
+ }
+
+#endif
+
+#ifdef ENTROPY_STATS
+ print_context_counters();
+ print_tree_update_probs();
+ print_mode_context();
+#endif
+
+#if CONFIG_INTERNAL_STATS
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded = (cpi->last_end_time_stamp_seen
+ - cpi->first_time_stamp_ever) / 10000000.000;
+ double total_encode_time = (cpi->time_receive_data +
+ cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
+
+ if (cpi->b_calculate_psnr)
+ {
+ YV12_BUFFER_CONFIG *lst_yv12 =
+ &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+ "GLPsnrP\tVPXSSIM\t\n");
+ for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
+ {
+ double dr = (double)cpi->bytes_in_layer[i] *
+ 8.0 / 1000.0 / time_encoded;
+ double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
+ lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0,
+ cpi->total_error2[i]);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ cpi->total_error2_p[i]);
+ double total_ssim = 100 * pow(cpi->sum_ssim[i] /
+ cpi->sum_weights[i], 8.0);
+
+ fprintf(f, "%5d\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\n",
+ i, dr,
+ cpi->sum_psnr[i] / cpi->frames_in_layer[i],
+ total_psnr,
+ cpi->sum_psnr_p[i] / cpi->frames_in_layer[i],
+ total_psnr2, total_ssim);
+ }
+ }
+ else
+ {
+ double samples = 3.0 / 2 * cpi->count *
+ lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0,
+ cpi->total_sq_error);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ cpi->total_sq_error2);
+ double total_ssim = 100 * pow(cpi->summed_quality /
+ cpi->summed_weights, 8.0);
+
+ fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+ "GLPsnrP\tVPXSSIM\t Time(us)\n");
+ fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%8.0f\n",
+ dr, cpi->total / cpi->count, total_psnr,
+ cpi->totalp / cpi->count, total_psnr2,
+ total_ssim, total_encode_time);
+ }
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+ "Time(us)\n");
+ for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
+ {
+ double dr = (double)cpi->bytes_in_layer[i] *
+ 8.0 / 1000.0 / time_encoded;
+ fprintf(f, "%5d\t%7.3f\t%6.4f\t"
+ "%6.4f\t%6.4f\t%6.4f\t%8.0f\n",
+ i, dr,
+ cpi->total_ssimg_y_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_u_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_v_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_all_in_layer[i] /
+ cpi->frames_in_layer[i],
+ total_encode_time);
+ }
+ }
+ else
+ {
+ fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+ "Time(us)\n");
+ fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+ cpi->total_ssimg_y / cpi->count,
+ cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count,
+ cpi->total_ssimg_all / cpi->count, total_encode_time);
+ }
+ }
+
+ fclose(f);
+#if 0
+ f = fopen("qskip.stt", "a");
+ fprintf(f, "minq:%d -maxq:%d skiptrue:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
+ fclose(f);
+#endif
+
+ }
+
+#endif
+
+
+#ifdef SPEEDSTATS
+
+ if (cpi->compressor_speed == 2)
+ {
+ int i;
+ FILE *f = fopen("cxspeed.stt", "a");
+ cnt_pm /= cpi->common.MBs;
+
+ for (i = 0; i < 16; i++)
+ fprintf(f, "%5d", frames_at_speed[i]);
+
+ fprintf(f, "\n");
+ fclose(f);
+ }
+
+#endif
+
+
+#ifdef MODE_STATS
+ {
+ extern int count_mb_seg[4];
+ FILE *f = fopen("modes.stt", "a");
+ double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+ fprintf(f, "intra_mode in Intra Frames:\n");
+ fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
+ fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
+ fprintf(f, "B: ");
+ {
+ int i;
+
+ for (i = 0; i < 10; i++)
+ fprintf(f, "%8d, ", b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+
+ fprintf(f, "Modes in Inter Frames:\n");
+ fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n",
+ inter_y_modes[0], inter_y_modes[1], inter_y_modes[2], inter_y_modes[3], inter_y_modes[4],
+ inter_y_modes[5], inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], inter_y_modes[9]);
+ fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]);
+ fprintf(f, "B: ");
+ {
+ int i;
+
+ for (i = 0; i < 15; i++)
+ fprintf(f, "%8d, ", inter_b_modes[i]);
+
+ fprintf(f, "\n");
+
+ }
+ fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+ fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+
+
+
+ fclose(f);
+ }
+#endif
+
+#ifdef ENTROPY_STATS
+ {
+ int i, j, k;
+ FILE *fmode = fopen("modecontext.c", "w");
+
+ fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+ fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts ");
+ fprintf(fmode, "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n");
+
+ for (i = 0; i < 10; i++)
+ {
+
+ fprintf(fmode, " { /* Above Mode : %d */\n", i);
+
+ for (j = 0; j < 10; j++)
+ {
+
+ fprintf(fmode, " {");
+
+ for (k = 0; k < 10; k++)
+ {
+ if (!intra_mode_stats[i][j][k])
+ fprintf(fmode, " %5d, ", 1);
+ else
+ fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+ }
+
+ fprintf(fmode, "}, /* left_mode %d */\n", j);
+
+ }
+
+ fprintf(fmode, " },\n");
+
+ }
+
+ fprintf(fmode, "};\n");
+ fclose(fmode);
+ }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+ if (0)
+ {
+ int i;
+ FILE *f = fopen("tokenbits.stt", "a");
+
+ for (i = 0; i < 28; i++)
+ fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+ fprintf(f, "\n");
+ fclose(f);
+ }
+
+#endif
+
+#if 0
+ {
+ printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+ printf("\n_frames recive_data encod_mb_row compress_frame Total\n");
+ printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+ }
+#endif
+
+ }
+
+#if CONFIG_MULTITHREAD
+ vp8cx_remove_encoder_threads(cpi);
+#endif
+
+#if CONFIG_TEMPORAL_DENOISING
+ vp8_denoiser_free(&cpi->denoiser);
+#endif
+ dealloc_compressor_data(cpi);
+ vpx_free(cpi->mb.ss);
+ vpx_free(cpi->tok);
+ vpx_free(cpi->cyclic_refresh_map);
+
+ vp8_remove_common(&cpi->common);
+ vpx_free(cpi);
+ *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+ fclose(yuv_file);
+#endif
+
+#if 0
+
+ if (keyfile)
+ fclose(keyfile);
+
+ if (framepsnr)
+ fclose(framepsnr);
+
+ if (kf_list)
+ fclose(kf_list);
+
+#endif
+
+}
+
+
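+/* Sum of squared error over one plane: full 16x16 tiles use the optimized
+ * vp8_mse16x16(), while any right or bottom remainder of an odd-sized
+ * plane is accumulated with plain scalar loops.
+ */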
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+ unsigned char *recon, int recon_stride,
+ unsigned int cols, unsigned int rows)
+{
+ unsigned int row, col;
+ uint64_t total_sse = 0;
+ int diff;
+
+ for (row = 0; row + 16 <= rows; row += 16)
+ {
+ for (col = 0; col + 16 <= cols; col += 16)
+ {
+ unsigned int sse;
+
+ vp8_mse16x16(orig + col, orig_stride,
+ recon + col, recon_stride,
+ &sse);
+ total_sse += sse;
+ }
+
+ /* Handle odd-sized width */
+ if (col < cols)
+ {
+ unsigned int border_row, border_col;
+ unsigned char *border_orig = orig;
+ unsigned char *border_recon = recon;
+
+ for (border_row = 0; border_row < 16; border_row++)
+ {
+ for (border_col = col; border_col < cols; border_col++)
+ {
+ diff = border_orig[border_col] - border_recon[border_col];
+ total_sse += diff * diff;
+ }
+
+ border_orig += orig_stride;
+ border_recon += recon_stride;
+ }
+ }
+
+ orig += orig_stride * 16;
+ recon += recon_stride * 16;
+ }
+
+ /* Handle odd-sized height */
+ for (; row < rows; row++)
+ {
+ for (col = 0; col < cols; col++)
+ {
+ diff = orig[col] - recon[col];
+ total_sse += diff * diff;
+ }
+
+ orig += orig_stride;
+ recon += recon_stride;
+ }
+
+ vp8_clear_system_state();
+ return total_sse;
+}
+
+
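+/* Builds a VPX_CODEC_PSNR_PKT: sse[0]/samples[0] accumulate all three
+ * planes while indices 1..3 hold Y, U and V individually; chroma
+ * dimensions are the luma dimensions rounded up to half size.
+ */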
+static void generate_psnr_packet(VP8_COMP *cpi)
+{
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ struct vpx_codec_cx_pkt pkt;
+ uint64_t sse;
+ int i;
+ unsigned int width = cpi->common.Width;
+ unsigned int height = cpi->common.Height;
+
+ pkt.kind = VPX_CODEC_PSNR_PKT;
+ sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride,
+ width, height);
+ pkt.data.psnr.sse[0] = sse;
+ pkt.data.psnr.sse[1] = sse;
+ pkt.data.psnr.samples[0] = width * height;
+ pkt.data.psnr.samples[1] = width * height;
+
+ width = (width + 1) / 2;
+ height = (height + 1) / 2;
+
+ sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride,
+ width, height);
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[2] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[2] = width * height;
+
+ sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride,
+ width, height);
+ pkt.data.psnr.sse[0] += sse;
+ pkt.data.psnr.sse[3] = sse;
+ pkt.data.psnr.samples[0] += width * height;
+ pkt.data.psnr.samples[3] = width * height;
+
+ for (i = 0; i < 4; i++)
+ pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+ (double)(pkt.data.psnr.sse[i]));
+
+ vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp8_use_as_reference(VP8_COMP *cpi, int ref_frame_flags)
+{
+ if (ref_frame_flags > 7)
+        return -1;
+
+ cpi->ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags)
+{
+ if (ref_frame_flags > 7)
+        return -1;
+
+ cpi->common.refresh_golden_frame = 0;
+ cpi->common.refresh_alt_ref_frame = 0;
+ cpi->common.refresh_last_frame = 0;
+
+ if (ref_frame_flags & VP8_LAST_FRAME)
+ cpi->common.refresh_last_frame = 1;
+
+ if (ref_frame_flags & VP8_GOLD_FRAME)
+ cpi->common.refresh_golden_frame = 1;
+
+ if (ref_frame_flags & VP8_ALTR_FRAME)
+ cpi->common.refresh_alt_ref_frame = 1;
+
+ return 0;
+}
+
+int vp8_get_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else
+ return -1;
+
+ vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
+
+ return 0;
+}
+int vp8_set_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int ref_fb_idx;
+
+ if (ref_frame_flag == VP8_LAST_FRAME)
+ ref_fb_idx = cm->lst_fb_idx;
+ else if (ref_frame_flag == VP8_GOLD_FRAME)
+ ref_fb_idx = cm->gld_fb_idx;
+ else if (ref_frame_flag == VP8_ALTR_FRAME)
+ ref_fb_idx = cm->alt_fb_idx;
+ else
+ return -1;
+
+ vp8_yv12_copy_frame(sd, &cm->yv12_fb[ref_fb_idx]);
+
+ return 0;
+}
+int vp8_update_entropy(VP8_COMP *cpi, int update)
+{
+ VP8_COMMON *cm = &cpi->common;
+ cm->refresh_entropy_probs = update;
+
+ return 0;
+}
+
+
+#if OUTPUT_YUV_SRC
+void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
+{
+ FILE *yuv_file = fopen(name, "ab");
+ unsigned char *src = s->y_buffer;
+ int h = s->y_height;
+
+ do
+ {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ }
+ while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do
+ {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ }
+ while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do
+ {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ }
+ while (--h);
+
+ fclose(yuv_file);
+}
+#endif
+
+
+static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+    /* Are we resizing the image? */
+ if (cm->horiz_scale != 0 || cm->vert_scale != 0)
+ {
+#if CONFIG_SPATIAL_RESAMPLING
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+ int tmp_height;
+
+ if (cm->vert_scale == 3)
+ tmp_height = 9;
+ else
+ tmp_height = 11;
+
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+ vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
+ tmp_height, hs, hr, vs, vr, 0);
+
+ vp8_yv12_extend_frame_borders(&cpi->scaled_source);
+ cpi->Source = &cpi->scaled_source;
+#endif
+ }
+ else
+ cpi->Source = sd;
+}
+
+
+static int resize_key_frame(VP8_COMP *cpi)
+{
+#if CONFIG_SPATIAL_RESAMPLING
+ VP8_COMMON *cm = &cpi->common;
+
+    /* Do we need to apply resampling for one-pass CBR?
+     * In one pass this is more limited than in two-pass CBR.
+     * The test, and any change, is only made once per key frame sequence.
+     */
+ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
+ {
+ int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+ int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+ int new_width, new_height;
+
+ /* If we are below the resample DOWN watermark then scale down a
+ * notch.
+ */
+ if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+ {
+ cm->horiz_scale = (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
+ cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
+ }
+ /* Should we now start scaling back up */
+ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+ {
+ cm->horiz_scale = (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
+ cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
+ }
+
+        /* Get the new height and width */
+ Scale2Ratio(cm->horiz_scale, &hr, &hs);
+ Scale2Ratio(cm->vert_scale, &vr, &vs);
+ new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+ new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+ /* If the image size has changed we need to reallocate the buffers
+ * and resample the source image
+ */
+ if ((cm->Width != new_width) || (cm->Height != new_height))
+ {
+ cm->Width = new_width;
+ cm->Height = new_height;
+ vp8_alloc_compressor_data(cpi);
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+ return 1;
+ }
+ }
+
+#endif
+ return 0;
+}
+
+
+static void update_alt_ref_frame_stats(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ /* Select an interval before next GF or altref */
+ if (!cpi->auto_gold)
+ cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+ if ((cpi->pass != 2) && cpi->frames_till_gf_update_due)
+ {
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ /* Set the bits per frame that we should try and recover in
+ * subsequent inter frames to account for the extra GF spend...
+         * note that this does not apply for GF updates that occur
+ * coincident with a key frame as the extra cost of key frames is
+ * dealt with elsewhere.
+ */
+ cpi->gf_overspend_bits += cpi->projected_frame_size;
+ cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+ }
+
+ /* Update data structure that monitors level of reference to last GF */
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+    /* This frame's refresh means subsequent frames don't refresh unless the user specifies otherwise */
+ cpi->common.frames_since_golden = 0;
+
+ /* Clear the alternate reference update pending flag. */
+ cpi->source_alt_ref_pending = 0;
+
+    /* Set the alternate reference frame active flag */
+ cpi->source_alt_ref_active = 1;
+
+
+}
+static void update_golden_frame_stats(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ /* Update the Golden frame usage counts. */
+ if (cm->refresh_golden_frame)
+ {
+ /* Select an interval before next GF */
+ if (!cpi->auto_gold)
+ cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+ if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0))
+ {
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ /* Set the bits per frame that we should try and recover in
+ * subsequent inter frames to account for the extra GF spend...
+             * note that this does not apply for GF updates that occur
+ * coincident with a key frame as the extra cost of key frames
+ * is dealt with elsewhere.
+ */
+ if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active)
+ {
+                /* Calculate GF bits to be recovered
+ * Projected size - av frame bits available for inter
+ * frames for clip as a whole
+ */
+ cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->inter_frame_target);
+ }
+
+ cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+
+ }
+
+ /* Update data structure that monitors level of reference to last GF */
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+        /* This frame's refresh means subsequent frames don't refresh
+         * unless the user specifies otherwise
+         */
+ cm->refresh_golden_frame = 0;
+ cpi->common.frames_since_golden = 0;
+
+ cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+ cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+ /* ******** Fixed Q test code only ************ */
+ /* If we are going to use the ALT reference for the next group of
+ * frames set a flag to say so.
+ */
+ if (cpi->oxcf.fixed_q >= 0 &&
+ cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->source_alt_ref_pending = 1;
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+
+ if (!cpi->source_alt_ref_pending)
+ cpi->source_alt_ref_active = 0;
+
+ /* Decrement count down till next gf */
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+ }
+ else if (!cpi->common.refresh_alt_ref_frame)
+ {
+ /* Decrement count down till next gf */
+ if (cpi->frames_till_gf_update_due > 0)
+ cpi->frames_till_gf_update_due--;
+
+        if (cpi->common.frames_till_alt_ref_frame)
+            cpi->common.frames_till_alt_ref_frame--;
+
+        cpi->common.frames_since_golden++;
+
+ if (cpi->common.frames_since_golden > 1)
+ {
+ cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+ cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+ cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+ }
+ }
+}
+
+/* This function updates the reference frame probability estimates that
+ * will be used during mode selection
+ */
+static void update_rd_ref_frame_probs(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ const int *const rfct = cpi->count_mb_ref_frame_usage;
+ const int rf_intra = rfct[INTRA_FRAME];
+ const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
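+    /* The prob_* values are 8-bit probabilities (out of 255): roughly, the
+     * likelihoods that a macroblock is intra coded, references LAST, or
+     * references GOLDEN (vs ALTREF); they feed the reference-frame
+     * signaling costs used in mode selection.
+     */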
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->prob_intra_coded = 255;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ else if (!(rf_intra + rf_inter))
+ {
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+
+ /* update reference frame costs since we can do better than what we got
+ * last frame.
+ */
+ if (cpi->oxcf.number_of_layers == 1)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->prob_intra_coded += 40;
+ cpi->prob_last_coded = 200;
+ cpi->prob_gf_coded = 1;
+ }
+ else if (cpi->common.frames_since_golden == 0)
+ {
+ cpi->prob_last_coded = 214;
+ }
+ else if (cpi->common.frames_since_golden == 1)
+ {
+ cpi->prob_last_coded = 192;
+ cpi->prob_gf_coded = 220;
+ }
+ else if (cpi->source_alt_ref_active)
+ {
+ cpi->prob_gf_coded -= 20;
+
+ if (cpi->prob_gf_coded < 10)
+ cpi->prob_gf_coded = 10;
+ }
+ if (!cpi->source_alt_ref_active)
+ cpi->prob_gf_coded = 255;
+ }
+}
+
+
+/* 1 = key, 0 = inter */
+static int decide_key_frame(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int code_key_frame = 0;
+
+ cpi->kf_boost = 0;
+
+ if (cpi->Speed > 11)
+ return 0;
+
+ /* Clear down mmx registers */
+ vp8_clear_system_state();
+
+ if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
+ {
+ double change = 1.0 * abs((int)(cpi->mb.intra_error -
+ cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+ double change2 = 1.0 * abs((int)(cpi->mb.prediction_error -
+ cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+ double minerror = cm->MBs * 256;
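+        /* change/change2 are the relative frame-to-frame deltas in intra
+         * and inter prediction error. A key frame is chosen when intra
+         * error is within 1.5x of inter error (the 10 * ... < 15 test) and
+         * either error has jumped by more than 25%.
+         */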
+
+ cpi->last_intra_error = cpi->mb.intra_error;
+ cpi->last_prediction_error = cpi->mb.prediction_error;
+
+ if (10 * cpi->mb.intra_error / (1 + cpi->mb.prediction_error) < 15
+ && cpi->mb.prediction_error > minerror
+ && (change > .25 || change2 > .25))
+ {
+ /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
+ return 1;
+ }
+
+ return 0;
+
+ }
+
+ /* If the following are true we might as well code a key frame */
+ if (((cpi->this_frame_percent_intra == 100) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) ||
+ ((cpi->this_frame_percent_intra > 95) &&
+ (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5))))
+ {
+ code_key_frame = 1;
+ }
+    /* In addition, if the following are true and this is not a golden frame
+     * then code a key frame. Note that on golden frames there often seems
+     * to be a pop in intra usage anyway, hence this restriction is
+     * designed to prevent spurious key frames. The intra pop needs to be
+     * investigated.
+     */
+ else if (((cpi->this_frame_percent_intra > 60) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) ||
+ ((cpi->this_frame_percent_intra > 75) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 3 / 2))) ||
+ ((cpi->this_frame_percent_intra > 90) &&
+ (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10))))
+ {
+ if (!cm->refresh_golden_frame)
+ code_key_frame = 1;
+ }
+
+ return code_key_frame;
+
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+ (void) size;
+ (void) dest;
+ (void) frame_flags;
+ vp8_set_quantizer(cpi, 26);
+
+ vp8_first_pass(cpi);
+}
+#endif
+
+#if 0
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
+{
+
+ /* write the frame */
+ FILE *yframe;
+ int i;
+ char filename[255];
+
+ sprintf(filename, "cx\\y%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->y_height; i++)
+ fwrite(frame->y_buffer + i * frame->y_stride, frame->y_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\u%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->u_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+ sprintf(filename, "cx\\v%04d.raw", this_frame);
+ yframe = fopen(filename, "wb");
+
+ for (i = 0; i < frame->uv_height; i++)
+ fwrite(frame->v_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+ fclose(yframe);
+}
+#endif
+/* return of 0 means drop frame */
+
+/* Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ */
+static int recode_loop_test( VP8_COMP *cpi,
+ int high_limit, int low_limit,
+ int q, int maxq, int minq )
+{
+ int force_recode = 0;
+ VP8_COMMON *cm = &cpi->common;
+
+    /* Is frame recode allowed at all?
+     * Yes if either recode mode 1 is selected, or mode 2 is selected
+     * and the frame is a key frame, golden frame or alt_ref_frame.
+ */
+ if ( (cpi->sf.recode_loop == 1) ||
+ ( (cpi->sf.recode_loop == 2) &&
+ ( (cm->frame_type == KEY_FRAME) ||
+ cm->refresh_golden_frame ||
+ cm->refresh_alt_ref_frame ) ) )
+ {
+ /* General over and under shoot tests */
+ if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
+ ((cpi->projected_frame_size < low_limit) && (q > minq)) )
+ {
+ force_recode = 1;
+ }
+ /* Special Constrained quality tests */
+ else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ /* Undershoot and below auto cq level */
+ if ( (q > cpi->cq_target_quality) &&
+ (cpi->projected_frame_size <
+ ((cpi->this_frame_target * 7) >> 3)))
+ {
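+                /* i.e. the projected size came in under 7/8 (87.5%) of
+                 * the frame target
+                 */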
+ force_recode = 1;
+ }
+ /* Severe undershoot and between auto and user cq level */
+ else if ( (q > cpi->oxcf.cq_level) &&
+ (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+ (cpi->active_best_quality > cpi->oxcf.cq_level))
+ {
+ force_recode = 1;
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ }
+ }
+ }
+
+ return force_recode;
+}
+
+static void update_reference_frames(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
+
+ /* At this point the new frame has been encoded.
+ * If any buffer copy / swapping is signaled it should be done here.
+ */
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME | VP8_ALTR_FRAME ;
+
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+
+ cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+ cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+#endif
+ }
+ else /* For non key frames */
+ {
+ if (cm->refresh_alt_ref_frame)
+ {
+ assert(!cm->copy_buffer_to_arf);
+
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP8_ALTR_FRAME;
+ cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+ cm->alt_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+#endif
+ }
+ else if (cm->copy_buffer_to_arf)
+ {
+ assert(!(cm->copy_buffer_to_arf & ~0x3));
+
+ if (cm->copy_buffer_to_arf == 1)
+ {
+ if(cm->alt_fb_idx != cm->lst_fb_idx)
+ {
+ yv12_fb[cm->lst_fb_idx].flags |= VP8_ALTR_FRAME;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+ cm->alt_fb_idx = cm->lst_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[ALTREF_FRAME] =
+ cpi->current_ref_frames[LAST_FRAME];
+#endif
+ }
+ }
+ else /* if (cm->copy_buffer_to_arf == 2) */
+ {
+ if(cm->alt_fb_idx != cm->gld_fb_idx)
+ {
+ yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME;
+ yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
+ cm->alt_fb_idx = cm->gld_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[ALTREF_FRAME] =
+ cpi->current_ref_frames[GOLDEN_FRAME];
+#endif
+ }
+ }
+ }
+
+ if (cm->refresh_golden_frame)
+ {
+ assert(!cm->copy_buffer_to_gf);
+
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME;
+ cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ cm->gld_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+#endif
+ }
+ else if (cm->copy_buffer_to_gf)
+ {
+            assert(!(cm->copy_buffer_to_gf & ~0x3));
+
+ if (cm->copy_buffer_to_gf == 1)
+ {
+ if(cm->gld_fb_idx != cm->lst_fb_idx)
+ {
+ yv12_fb[cm->lst_fb_idx].flags |= VP8_GOLD_FRAME;
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ cm->gld_fb_idx = cm->lst_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] =
+ cpi->current_ref_frames[LAST_FRAME];
+#endif
+ }
+ }
+ else /* if (cm->copy_buffer_to_gf == 2) */
+ {
+ if(cm->alt_fb_idx != cm->gld_fb_idx)
+ {
+ yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME;
+ yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+ cm->gld_fb_idx = cm->alt_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[GOLDEN_FRAME] =
+ cpi->current_ref_frames[ALTREF_FRAME];
+#endif
+ }
+ }
+ }
+ }
+
+ if (cm->refresh_last_frame)
+ {
+ cm->yv12_fb[cm->new_fb_idx].flags |= VP8_LAST_FRAME;
+ cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
+ cm->lst_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+ cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
+#endif
+ }
+}
+
+void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+{
+ const FRAME_TYPE frame_type = cm->frame_type;
+
+ if (cm->no_lpf)
+ {
+ cm->filter_level = 0;
+ }
+ else
+ {
+ struct vpx_usec_timer timer;
+
+ vp8_clear_system_state();
+
+ vpx_usec_timer_start(&timer);
+ if (cpi->sf.auto_filter == 0)
+ vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+
+ else
+ vp8cx_pick_filter_level(cpi->Source, cpi);
+
+ if (cm->filter_level > 0)
+ {
+ vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+ }
+
+ vpx_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+ }
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+#endif
+
+ if (cm->filter_level > 0)
+ {
+ vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type);
+ }
+
+ vp8_yv12_extend_frame_borders(cm->frame_to_show);
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+        /* We shouldn't have to keep multiple copies, as we know in advance
+         * which buffer we should start from. For now, to get something up
+         * and running, I've chosen to copy the buffers.
+         */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ int i;
+ vp8_yv12_copy_frame(
+ cpi->Source,
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+ vp8_yv12_extend_frame_borders(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+ for (i = 2; i < MAX_REF_FRAMES - 1; i++)
+ vp8_yv12_copy_frame(
+ cpi->Source,
+ &cpi->denoiser.yv12_running_avg[i]);
+ }
+ else /* For non key frames */
+ {
+ vp8_yv12_extend_frame_borders(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
+
+ if (cm->refresh_alt_ref_frame || cm->copy_buffer_to_arf)
+ {
+ vp8_yv12_copy_frame(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+ &cpi->denoiser.yv12_running_avg[ALTREF_FRAME]);
+ }
+ if (cm->refresh_golden_frame || cm->copy_buffer_to_gf)
+ {
+ vp8_yv12_copy_frame(
+ &cpi->denoiser.yv12_running_avg[LAST_FRAME],
+ &cpi->denoiser.yv12_running_avg[GOLDEN_FRAME]);
+ }
+ }
+
+ }
+#endif
+
+}
+
+static void encode_frame_to_data_rate
+(
+ VP8_COMP *cpi,
+ unsigned long *size,
+ unsigned char *dest,
+ unsigned char* dest_end,
+ unsigned int *frame_flags
+)
+{
+ int Q;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+
+ int Loop = 0;
+ int loop_count;
+
+ VP8_COMMON *cm = &cpi->common;
+ int active_worst_qchanged = 0;
+
+#if !(CONFIG_REALTIME_ONLY)
+ int q_low;
+ int q_high;
+ int zbin_oq_high;
+ int zbin_oq_low = 0;
+ int top_index;
+ int bottom_index;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+#endif
+
+ int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
+ cpi->oxcf.optimal_buffer_level / 100);
+ int drop_mark75 = drop_mark * 2 / 3;
+ int drop_mark50 = drop_mark / 4;
+ int drop_mark25 = drop_mark / 8;
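+    /* Worked example for the water marks above: with drop_frames_water_mark
+     * of 70 and an optimal_buffer_level of 6000, drop_mark is 4200 while
+     * drop_mark75, drop_mark50 and drop_mark25 are 2800, 1050 and 525.
+     * Despite their names they are 2/3, 1/4 and 1/8 of drop_mark.
+     */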
+
+
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+#if CONFIG_MULTITHREAD
+    /* wait for the previous frame's loopfilter thread to finish */
+ if (cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+ if(cpi->force_next_frame_intra)
+ {
+ cm->frame_type = KEY_FRAME; /* delayed intra frame */
+ cpi->force_next_frame_intra = 0;
+ }
+
+ /* For an alt ref frame in 2 pass we skip the call to the second pass
+ * function that sets the target bandwidth
+ */
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 2)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ /* Per frame bit target for the alt ref frame */
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ /* per second target bitrate */
+ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
+ cpi->output_frame_rate);
+ }
+ }
+ else
+#endif
+ cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
+
+ /* Default turn off buffer to buffer copying */
+ cm->copy_buffer_to_gf = 0;
+ cm->copy_buffer_to_arf = 0;
+
+ /* Clear zbin over-quant value and mode boost values. */
+ cpi->zbin_over_quant = 0;
+ cpi->zbin_mode_boost = 0;
+
+ /* Enable or disable mode based tweaking of the zbin
+ * For 2 Pass Only used where GF/ARF prediction quality
+ * is above a threshold
+ */
+ cpi->zbin_mode_boost_enabled = 1;
+ if (cpi->pass == 2)
+ {
+ if ( cpi->gfu_boost <= 400 )
+ {
+ cpi->zbin_mode_boost_enabled = 0;
+ }
+ }
+
+ /* Current default encoder behaviour for the altref sign bias */
+ if (cpi->source_alt_ref_active)
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+ else
+ cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+ /* Check to see if a key frame is signalled
+ * For two pass with auto key frame enabled cm->frame_type may already
+ * be set, but not for one pass.
+ */
+ if ((cm->current_video_frame == 0) ||
+ (cm->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+ {
+ /* Key frame from VFW/auto-keyframe/first frame */
+ cm->frame_type = KEY_FRAME;
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+    /* In multi-resolution encoding, frame_type is decided by the
+     * lowest-resolution encoder. The same frame_type is adopted when
+     * encoding at the other resolutions.
+     */
+ if (cpi->oxcf.mr_encoder_id)
+ {
+ LOWER_RES_FRAME_INFO* low_res_frame_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+ cm->frame_type = low_res_frame_info->frame_type;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ cpi->mr_low_res_mv_avail = 1;
+ cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+ == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+ == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+ == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+ }
+ }
+#endif
+
+ /* Set various flags etc to special state if it is a key frame */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ int i;
+
+ // Set the loop filter deltas and segmentation map update
+ setup_features(cpi);
+
+ /* The alternate reference frame cannot be active for a key frame */
+ cpi->source_alt_ref_active = 0;
+
+        /* Reset the RD threshold multipliers to the default of x1 (128) */
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ cpi->rd_thresh_mult[i] = 128;
+ }
+ }
+
+#if 0
+ /* Experimental code for lagged compress and one pass
+ * Initialise one_pass GF frames stats
+ * Update stats used for GF selection
+ */
+ {
+ cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS;
+
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0;
+ }
+#endif
+
+ update_rd_ref_frame_probs(cpi);
+
+ if (cpi->drop_frames_allowed)
+ {
+ /* The reset to decimation 0 is only done here for one pass.
+         * Once it is set, two pass leaves decimation on until the next kf.
+ */
+ if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0))
+ cpi->decimation_factor --;
+
+ if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0)
+ cpi->decimation_factor = 1;
+
+ else if (cpi->buffer_level < drop_mark25 && (cpi->decimation_factor == 2 || cpi->decimation_factor == 3))
+ {
+ cpi->decimation_factor = 3;
+ }
+ else if (cpi->buffer_level < drop_mark50 && (cpi->decimation_factor == 1 || cpi->decimation_factor == 2))
+ {
+ cpi->decimation_factor = 2;
+ }
+ else if (cpi->buffer_level < drop_mark75 && (cpi->decimation_factor == 0 || cpi->decimation_factor == 1))
+ {
+ cpi->decimation_factor = 1;
+ }
+ }
+
+ /* The following decimates the frame rate according to a regular
+     * pattern (i.e. to 1/2 or 2/3 frame rate). This can be used to help
+ * prevent buffer under-run in CBR mode. Alternatively it might be
+ * desirable in some situations to drop frame rate but throw more bits
+ * at each frame.
+ *
+ * Note that dropping a key frame can be problematic if spatial
+ * resampling is also active
+ */
+ if (cpi->decimation_factor > 0)
+ {
+ switch (cpi->decimation_factor)
+ {
+ case 1:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
+ break;
+ case 2:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+ break;
+ case 3:
+ cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4;
+ break;
+ }
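+        /* With decimation_factor 1 every other frame is dropped, so each
+         * coded frame gets 3/2 of the nominal budget; factors 2 and 3 keep
+         * fewer frames but only scale by 5/4, which appears to leave the
+         * remainder of the saving to refill the buffer.
+         */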
+
+ /* Note that we should not throw out a key frame (especially when
+ * spatial resampling is enabled).
+ */
+        if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->decimation_count = cpi->decimation_factor;
+ }
+ else if (cpi->decimation_count > 0)
+ {
+ cpi->decimation_count --;
+
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_store_drop_frame_info(cpi);
+#endif
+
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+
+#if CONFIG_INTERNAL_STATS
+ cpi->count ++;
+#endif
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ /* Propagate bits saved by dropping the frame to higher
+ * layers
+ */
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->bits_off_target += cpi->av_per_frame_bandwidth;
+ if (lc->bits_off_target > lc->maximum_buffer_size)
+ lc->bits_off_target = lc->maximum_buffer_size;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
+
+ return;
+ }
+ else
+ cpi->decimation_count = cpi->decimation_factor;
+ }
+ else
+ cpi->decimation_count = 0;
+
+ /* Decide how big to make the frame */
+ if (!vp8_pick_frame_size(cpi))
+ {
+        /* TODO: the two drop-frame paths and their return code could be merged. */
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_store_drop_frame_info(cpi);
+#endif
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ return;
+ }
+
+ /* Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
+     * This has a knock-on effect on active best quality as well.
+ * For CBR if the buffer reaches its maximum level then we can no longer
+ * save up bits for later frames so we might as well use them up
+ * on the current frame.
+ */
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
+ {
+ /* Max adjustment is 1/4 */
+ int Adjustment = cpi->active_worst_quality / 4;
+
+ if (Adjustment)
+ {
+ int buff_lvl_step;
+
+ if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
+ {
+ buff_lvl_step = (int)
+ ((cpi->oxcf.maximum_buffer_size -
+ cpi->oxcf.optimal_buffer_level) /
+ Adjustment);
+
+ if (buff_lvl_step)
+ Adjustment = (int)
+ ((cpi->buffer_level -
+ cpi->oxcf.optimal_buffer_level) /
+ buff_lvl_step);
+ else
+ Adjustment = 0;
+ }
+
+ cpi->active_worst_quality -= Adjustment;
+
+ if(cpi->active_worst_quality < cpi->active_best_quality)
+ cpi->active_worst_quality = cpi->active_best_quality;
+ }
+ }
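+    /* Example of the adjustment above: with active_worst_quality at 40 the
+     * maximum Adjustment is 10; buff_lvl_step then splits the span between
+     * the optimal and maximum buffer levels into 10 steps, and one unit of
+     * Q is given back for each step the buffer sits above the optimal level.
+     */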
+
+ /* Set an active best quality and if necessary active worst quality
+ * There is some odd behavior for one pass here that needs attention.
+ */
+ if ( (cpi->pass == 2) || (cpi->ni_frames > 150))
+ {
+ vp8_clear_system_state();
+
+ Q = cpi->active_worst_quality;
+
+ if ( cm->frame_type == KEY_FRAME )
+ {
+ if ( cpi->pass == 2 )
+ {
+ if (cpi->gfu_boost > 600)
+ cpi->active_best_quality = kf_low_motion_minq[Q];
+ else
+ cpi->active_best_quality = kf_high_motion_minq[Q];
+
+ /* Special case for key frames forced because we have reached
+ * the maximum key frame interval. Here force the Q to a range
+ * based on the ambient Q to reduce the risk of popping
+ */
+ if ( cpi->this_key_frame_forced )
+ {
+ if ( cpi->active_best_quality > cpi->avg_frame_qindex * 7/8)
+ cpi->active_best_quality = cpi->avg_frame_qindex * 7/8;
+ else if ( cpi->active_best_quality < cpi->avg_frame_qindex >> 2 )
+ cpi->active_best_quality = cpi->avg_frame_qindex >> 2;
+ }
+ }
+ /* One pass more conservative */
+ else
+ cpi->active_best_quality = kf_high_motion_minq[Q];
+ }
+
+ else if (cpi->oxcf.number_of_layers==1 &&
+ (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame))
+ {
+ /* Use the lower of cpi->active_worst_quality and recent
+ * average Q as basis for GF/ARF Q limit unless last frame was
+ * a key frame.
+ */
+ if ( (cpi->frames_since_key > 1) &&
+ (cpi->avg_frame_qindex < cpi->active_worst_quality) )
+ {
+ Q = cpi->avg_frame_qindex;
+ }
+
+        /* For constrained quality don't allow Q less than the cq level */
+ if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < cpi->cq_target_quality) )
+ {
+ Q = cpi->cq_target_quality;
+ }
+
+ if ( cpi->pass == 2 )
+ {
+ if ( cpi->gfu_boost > 1000 )
+ cpi->active_best_quality = gf_low_motion_minq[Q];
+ else if ( cpi->gfu_boost < 400 )
+ cpi->active_best_quality = gf_high_motion_minq[Q];
+ else
+ cpi->active_best_quality = gf_mid_motion_minq[Q];
+
+ /* Constrained quality use slightly lower active best. */
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
+ {
+ cpi->active_best_quality =
+ cpi->active_best_quality * 15/16;
+ }
+ }
+ /* One pass more conservative */
+ else
+ cpi->active_best_quality = gf_high_motion_minq[Q];
+ }
+ else
+ {
+ cpi->active_best_quality = inter_minq[Q];
+
+        /* For the constant/constrained quality mode we don't want
+         * q to fall below the cq level.
+         */
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (cpi->active_best_quality < cpi->cq_target_quality) )
+ {
+            /* If we are strongly undershooting the target rate in the last
+             * few frames then use the user-passed cq value, not the auto
+             * cq value.
+             */
+ if ( cpi->rolling_actual_bits < cpi->min_frame_bandwidth )
+ cpi->active_best_quality = cpi->oxcf.cq_level;
+ else
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
+ }
+
+    /* If CBR and the buffer is getting full then it is reasonable to allow
+ * higher quality on the frames to prevent bits just going to waste.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+        /* Note that the use of >= here eliminates the risk of a divide
+         * by 0 error in the else if clause.
+         */
+ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
+ cpi->active_best_quality = cpi->best_quality;
+
+ else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)
+ {
+ int Fraction = (int)
+ (((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128)
+ / (cpi->oxcf.maximum_buffer_size -
+ cpi->oxcf.optimal_buffer_level));
+ int min_qadjustment = ((cpi->active_best_quality -
+ cpi->best_quality) * Fraction) / 128;
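+            /* Fraction is in Q7: a buffer level halfway between optimal and
+             * maximum gives Fraction = 64, so half of the headroom between
+             * active_best_quality and best_quality is granted.
+             */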
+
+ cpi->active_best_quality -= min_qadjustment;
+ }
+ }
+ }
+ /* Make sure constrained quality mode limits are adhered to for the first
+ * few frames of one pass encodes
+ */
+ else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ if ( (cm->frame_type == KEY_FRAME) ||
+ cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
+ {
+ cpi->active_best_quality = cpi->best_quality;
+ }
+ else if (cpi->active_best_quality < cpi->cq_target_quality)
+ {
+ cpi->active_best_quality = cpi->cq_target_quality;
+ }
+ }
+
+ /* Clip the active best and worst quality values to limits */
+ if (cpi->active_worst_quality > cpi->worst_quality)
+ cpi->active_worst_quality = cpi->worst_quality;
+
+ if (cpi->active_best_quality < cpi->best_quality)
+ cpi->active_best_quality = cpi->best_quality;
+
+ if ( cpi->active_worst_quality < cpi->active_best_quality )
+ cpi->active_worst_quality = cpi->active_best_quality;
+
+ /* Determine initial Q to try */
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+#if !(CONFIG_REALTIME_ONLY)
+
+ /* Set highest allowed value for Zbin over quant */
+ if (cm->frame_type == KEY_FRAME)
+ zbin_oq_high = 0;
+ else if ((cpi->oxcf.number_of_layers == 1) && ((cm->refresh_alt_ref_frame ||
+ (cm->refresh_golden_frame && !cpi->source_alt_ref_active))))
+ {
+ zbin_oq_high = 16;
+ }
+ else
+ zbin_oq_high = ZBIN_OQ_MAX;
+#endif
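+    /* zbin_over_quant presumably extends the effective quantizer range past
+     * MAXQ by widening the zero bin: key frames never use it, GF/ARF updates
+     * allow a small amount (16), and other inter frames the full range.
+     */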
+
+ /* Setup background Q adjustment for error resilient mode.
+ * For multi-layer encodes only enable this for the base layer.
+ */
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ if (cpi->current_layer==0)
+ cyclic_background_refresh(cpi, Q, 0);
+ else
+ disable_segmentation(cpi);
+ }
+
+ vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Limit Q range for the adaptive loop. */
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+ q_low = cpi->active_best_quality;
+ q_high = cpi->active_worst_quality;
+#endif
+
+ vp8_save_coding_context(cpi);
+
+ loop_count = 0;
+
+ scale_and_extend_source(cpi->un_scaled_source, cpi);
+
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC && !(CONFIG_TEMPORAL_DENOISING)
+
+ if (cpi->oxcf.noise_sensitivity > 0)
+ {
+ unsigned char *src;
+ int l = 0;
+
+ switch (cpi->oxcf.noise_sensitivity)
+ {
+ case 1:
+ l = 20;
+ break;
+ case 2:
+ l = 40;
+ break;
+ case 3:
+ l = 60;
+ break;
+ case 4:
+ l = 80;
+ break;
+ case 5:
+ l = 100;
+ break;
+ case 6:
+ l = 150;
+ break;
+ }
+
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1, 0);
+ }
+ else
+ {
+ vp8_de_noise(cm, cpi->Source, cpi->Source, l , 1, 0);
+
+ src = cpi->Source->y_buffer;
+
+ if (cpi->Source->y_stride < 0)
+ {
+ src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+ }
+ }
+ }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+ vp8_write_yuv_frame(cpi->Source);
+#endif
+
+ do
+ {
+ vp8_clear_system_state();
+
+ vp8_set_quantizer(cpi, Q);
+
+ /* setup skip prob for costing in mode/mv decision */
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ cpi->prob_skip_false = cpi->base_skip_false_prob[Q];
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ if (cpi->last_skip_false_probs[2] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+
+ /*
+ if(cpi->last_skip_false_probs[2]!=0 && abs(Q- cpi->last_skip_probs_q[2])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+ else if (cpi->last_skip_false_probs[2]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[2] + cpi->prob_skip_false ) / 2;
+ */
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ if (cpi->last_skip_false_probs[1] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+
+ /*
+ if(cpi->last_skip_false_probs[1]!=0 && abs(Q- cpi->last_skip_probs_q[1])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+ else if (cpi->last_skip_false_probs[1]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[1] + cpi->prob_skip_false ) / 2;
+ */
+ }
+ else
+ {
+ if (cpi->last_skip_false_probs[0] != 0)
+ cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+
+ /*
+ if(cpi->last_skip_false_probs[0]!=0 && abs(Q- cpi->last_skip_probs_q[0])<=16 )
+ cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+ else if(cpi->last_skip_false_probs[0]!=0)
+ cpi->prob_skip_false = (cpi->last_skip_false_probs[0] + cpi->prob_skip_false ) / 2;
+ */
+ }
+
+                /* as this is for a cost estimate, make sure it does not
+                 * go to an extreme either way
+                 */
+ if (cpi->prob_skip_false < 5)
+ cpi->prob_skip_false = 5;
+
+ if (cpi->prob_skip_false > 250)
+ cpi->prob_skip_false = 250;
+
+ if (cpi->oxcf.number_of_layers == 1 && cpi->is_src_frame_alt_ref)
+ cpi->prob_skip_false = 1;
+ }
+
+#if 0
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("skip.stt", "a");
+ fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false);
+ fclose(f);
+ }
+
+#endif
+
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ if(resize_key_frame(cpi))
+ {
+ /* If the frame size has changed, need to reset Q, quantizer,
+ * and background refresh.
+ */
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ if (cpi->cyclic_refresh_mode_enabled)
+ {
+ if (cpi->current_layer==0)
+ cyclic_background_refresh(cpi, Q, 0);
+ else
+ disable_segmentation(cpi);
+ }
+ vp8_set_quantizer(cpi, Q);
+ }
+
+ vp8_setup_key_frame(cpi);
+ }
+
+
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ if(cpi->oxcf.error_resilient_mode)
+ cm->refresh_entropy_probs = 0;
+
+ if (cpi->oxcf.error_resilient_mode & VPX_ERROR_RESILIENT_PARTITIONS)
+ {
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_entropy_probs = 1;
+ }
+
+ if (cm->refresh_entropy_probs == 0)
+ {
+ /* save a copy for later refresh */
+ vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+ }
+
+ vp8_update_coef_context(cpi);
+
+ vp8_update_coef_probs(cpi);
+
+ /* transform / motion compensation build reconstruction frame
+ * +pack coef partitions
+ */
+ vp8_encode_frame(cpi);
+
+ /* cpi->projected_frame_size is not needed for RT mode */
+ }
+#else
+ /* transform / motion compensation build reconstruction frame */
+ vp8_encode_frame(cpi);
+
+ cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
+ cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
+#endif
+ vp8_clear_system_state();
+
+ /* Test to see if the stats generated for this frame indicate that
+ * we should have coded a key frame (assuming that we didn't)!
+ */
+
+ if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME
+ && cpi->compressor_speed != 2)
+ {
+#if !(CONFIG_REALTIME_ONLY)
+ if (decide_key_frame(cpi))
+ {
+ /* Reset all our sizing numbers and recode */
+ cm->frame_type = KEY_FRAME;
+
+ vp8_pick_frame_size(cpi);
+
+ /* Clear the Alt reference frame active flag when we have
+ * a key frame
+ */
+ cpi->source_alt_ref_active = 0;
+
+ // Set the loop filter deltas and segmentation map update
+ setup_features(cpi);
+
+ vp8_restore_coding_context(cpi);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+ /* Limit Q range for the adaptive loop. */
+ bottom_index = cpi->active_best_quality;
+ top_index = cpi->active_worst_quality;
+ q_low = cpi->active_best_quality;
+ q_high = cpi->active_worst_quality;
+
+ loop_count++;
+ Loop = 1;
+
+ continue;
+ }
+#endif
+ }
+
+ vp8_clear_system_state();
+
+ if (frame_over_shoot_limit == 0)
+ frame_over_shoot_limit = 1;
+
+        /* Are we overshooting while up against the limit of the active max Q? */
+ if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
+ (Q == cpi->active_worst_quality) &&
+ (cpi->active_worst_quality < cpi->worst_quality) &&
+ (cpi->projected_frame_size > frame_over_shoot_limit))
+ {
+ int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
+
+ /* If so is there any scope for relaxing it */
+ while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
+ {
+ cpi->active_worst_quality++;
+ /* Assume 1 qstep = about 4% on frame size. */
+ over_size_percent = (int)(over_size_percent * 0.96);
+ }
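+            /* Under the 4%-per-step model the loop keeps relaxing until the
+             * modelled overshoot decays below 1%, or active_worst_quality
+             * reaches worst_quality, whichever comes first.
+             */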
+#if !(CONFIG_REALTIME_ONLY)
+ top_index = cpi->active_worst_quality;
+#endif
+ /* If we have updated the active max Q do not call
+ * vp8_update_rate_correction_factors() this loop.
+ */
+ active_worst_qchanged = 1;
+ }
+ else
+ active_worst_qchanged = 0;
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Special case handling for forced key frames */
+ if ( (cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced )
+ {
+ int last_q = Q;
+ int kf_err = vp8_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+
+ /* The key frame is not good enough */
+ if ( kf_err > ((cpi->ambient_err * 7) >> 3) )
+ {
+ /* Lower q_high */
+ q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+ /* Adjust Q */
+ Q = (q_high + q_low) >> 1;
+ }
+ /* The key frame is much better than the previous frame */
+ else if ( kf_err < (cpi->ambient_err >> 1) )
+ {
+ /* Raise q_low */
+ q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+ /* Adjust Q */
+ Q = (q_high + q_low + 1) >> 1;
+ }
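+            /* This is one bisection step between q_low and q_high: a key
+             * frame that is too poor lowers q_high, one much better than the
+             * ambient error raises q_low, and the asymmetric rounding ensures
+             * Q actually moves off the previous value.
+             */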
+
+ /* Clamp Q to upper and lower limits: */
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+ Loop = Q != last_q;
+ }
+
+ /* Is the projected frame size out of range and are we allowed
+ * to attempt to recode.
+ */
+ else if ( recode_loop_test( cpi,
+ frame_over_shoot_limit, frame_under_shoot_limit,
+ Q, top_index, bottom_index ) )
+ {
+ int last_q = Q;
+ int Retries = 0;
+
+ /* Frame size out of permitted range. Update correction factor
+ * & compute new Q to try...
+ */
+
+ /* Frame is too large */
+ if (cpi->projected_frame_size > cpi->this_frame_target)
+ {
+                /* Raise q_low to at least the current value */
+ q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+ /* If we are using over quant do the same for zbin_oq_low */
+ if (cpi->zbin_over_quant > 0)
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+ if (undershoot_seen)
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low + 1) / 2;
+
+ /* Adjust cpi->zbin_over_quant (only allowed when Q
+ * is max)
+ */
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ {
+ zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ }
+ else
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 0);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10))
+ {
+ vp8_update_rate_correction_factors(cpi, 0);
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ Retries ++;
+ }
+ }
+
+ overshoot_seen = 1;
+ }
+ /* Frame is too small */
+ else
+ {
+ if (cpi->zbin_over_quant == 0)
+ /* Lower q_high if not using over quant */
+ q_high = (Q > q_low) ? (Q - 1) : q_low;
+ else
+ /* else lower zbin_oq_high */
+ zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+ if (overshoot_seen)
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 1);
+
+ Q = (q_high + q_low) / 2;
+
+ /* Adjust cpi->zbin_over_quant (only allowed when Q
+ * is max)
+ */
+ if (Q < MAXQ)
+ cpi->zbin_over_quant = 0;
+ else
+ cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+ }
+ else
+ {
+ /* Update rate_correction_factor unless
+ * cpi->active_worst_quality has changed.
+ */
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 0);
+
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+ /* Special case reset for qlow for constrained quality.
+ * This should only trigger where there is very substantial
+ * undershoot on a frame and the auto cq level is above
+                     * the user passed in value.
+ */
+ if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (Q < q_low) )
+ {
+ q_low = Q;
+ }
+
+ while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
+ {
+ vp8_update_rate_correction_factors(cpi, 0);
+ Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+ Retries ++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+
+ /* Clamp Q to upper and lower limits: */
+ if (Q > q_high)
+ Q = q_high;
+ else if (Q < q_low)
+ Q = q_low;
+
+            /* Clamp cpi->zbin_over_quant */
+            if (cpi->zbin_over_quant < zbin_oq_low)
+                cpi->zbin_over_quant = zbin_oq_low;
+            else if (cpi->zbin_over_quant > zbin_oq_high)
+                cpi->zbin_over_quant = zbin_oq_high;
+
+ Loop = Q != last_q;
+ }
+ else
+#endif
+ Loop = 0;
+
+ if (cpi->is_src_frame_alt_ref)
+ Loop = 0;
+
+ if (Loop == 1)
+ {
+ vp8_restore_coding_context(cpi);
+ loop_count++;
+#if CONFIG_INTERNAL_STATS
+ cpi->tot_recode_hits++;
+#endif
+ }
+ }
+ while (Loop == 1);
+
+#if 0
+ /* Experimental code for lagged and one pass
+ * Update stats used for one pass GF selection
+ */
+ {
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error;
+ cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0;
+ }
+#endif
+
+ /* Special case code to reduce pulsing when key frames are forced at a
+ * fixed interval. Note the reconstruction error if it is the frame before
+ * the force key frame
+ */
+ if ( cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0) )
+ {
+ cpi->ambient_err = vp8_calc_ss_err(cpi->Source,
+ &cm->yv12_fb[cm->new_fb_idx]);
+ }
+
+    /* This frame's MVs are saved and will be used in the next frame's MV
+     * predictor. The last frame has one more line (added at the bottom) and
+     * one more column (added at the right) than cm->mip. The edge elements
+     * are initialized to 0.
+     */
+#if CONFIG_MULTI_RES_ENCODING
+ if(!cpi->oxcf.mr_encoder_id && cm->show_frame)
+#else
+ if(cm->show_frame) /* do not save for altref frame */
+#endif
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of allocated MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mip;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ for (mb_row = 0; mb_row < cm->mb_rows+1; mb_row ++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols+1; mb_col ++)
+ {
+ if(tmp->mbmi.ref_frame != INTRA_FRAME)
+ cpi->lfmv[mb_col + mb_row*(cm->mode_info_stride+1)].as_int = tmp->mbmi.mv.as_int;
+
+ cpi->lf_ref_frame_sign_bias[mb_col + mb_row*(cm->mode_info_stride+1)] = cm->ref_frame_sign_bias[tmp->mbmi.ref_frame];
+ cpi->lf_ref_frame[mb_col + mb_row*(cm->mode_info_stride+1)] = tmp->mbmi.ref_frame;
+ tmp++;
+ }
+ }
+ }
+ }
+
+ /* Count last ref frame 0,0 usage on current encoded frame. */
+ {
+ int mb_row;
+ int mb_col;
+ /* Point to beginning of MODE_INFO arrays. */
+ MODE_INFO *tmp = cm->mi;
+
+ cpi->inter_zz_count = 0;
+ cpi->zeromv_count = 0;
+
+ if(cm->frame_type != KEY_FRAME)
+ {
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+ {
+ if(tmp->mbmi.mode == ZEROMV && tmp->mbmi.ref_frame == LAST_FRAME)
+ cpi->inter_zz_count++;
+ if(tmp->mbmi.mode == ZEROMV)
+ cpi->zeromv_count++;
+ tmp++;
+ }
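+                /* Step over the extra border column at the end of the row */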
+ tmp++;
+ }
+ }
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ vp8_cal_dissimilarity(cpi);
+#endif
+
+    /* Update the GF usage maps.
+ * This is done after completing the compression of a frame when all
+ * modes etc. are finalized but before loop filter
+ */
+ if (cpi->oxcf.number_of_layers == 1)
+ vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->refresh_last_frame = 1;
+
+#if 0
+ {
+ FILE *f = fopen("gfactive.stt", "a");
+ fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+ fclose(f);
+ }
+#endif
+
+ /* For inter frames the current default behavior is that when
+ * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
+ * This is purely an encoder decision at present.
+ */
+ if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame)
+ cm->copy_buffer_to_arf = 2;
+ else
+ cm->copy_buffer_to_arf = 0;
+
+ cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+
+#if CONFIG_MULTITHREAD
+ if (cpi->b_multi_threaded)
+ {
+ /* start loopfilter in separate thread */
+ sem_post(&cpi->h_event_start_lpf);
+ cpi->b_lpf_running = 1;
+ }
+ else
+#endif
+ {
+ vp8_loopfilter_frame(cpi, cm);
+ }
+
+ update_reference_frames(cpi);
+
+#if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if (cpi->oxcf.error_resilient_mode)
+ {
+ cm->refresh_entropy_probs = 0;
+ }
+#endif
+
+#if CONFIG_MULTITHREAD
+    /* wait until filter_level is picked so that we can continue with stream packing */
+ if (cpi->b_multi_threaded)
+ sem_wait(&cpi->h_event_end_lpf);
+#endif
+
+ /* build the bitstream */
+ vp8_pack_bitstream(cpi, dest, dest_end, size);
+
+#if CONFIG_MULTITHREAD
+ /* if PSNR packets are generated we have to wait for the lpf */
+ if (cpi->b_lpf_running && cpi->b_calculate_psnr)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+    /* Storing frame_type is done outside the loop above since it is also
+     * needed in motion search, not just the loopfilter */
+ cm->last_frame_type = cm->frame_type;
+
+ /* Update rate control heuristics */
+ cpi->total_byte_count += (*size);
+ cpi->projected_frame_size = (*size) << 3;
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ cpi->layer_context[i].total_byte_count += (*size);
+ }
+
+ if (!active_worst_qchanged)
+ vp8_update_rate_correction_factors(cpi, 2);
+
+ cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ vp8_adjust_key_frame_context(cpi);
+ }
+
+ /* Keep a record of ambient average Q. */
+ if (cm->frame_type != KEY_FRAME)
+ cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
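+        /* i.e. a 3:1 weighted average toward the history, with the +2
+         * rounding the result to nearest.
+         */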
+
+ /* Keep a record from which we can calculate the average Q excluding
+ * GF updates and key frames
+ */
+ if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+ (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)))
+ {
+ cpi->ni_frames++;
+
+ /* Calculate the average Q for normal inter frames (not key or GFU
+ * frames).
+ */
+ if ( cpi->pass == 2 )
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+ }
+ else
+ {
+ /* Damp value for first few frames */
+ if (cpi->ni_frames > 150 )
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+ }
+ /* For one pass, early in the clip ... average the current frame Q
+ * value with the worstq entered by the user as a dampening measure
+ */
+ else
+ {
+ cpi->ni_tot_qi += Q;
+ cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
+ }
+
+ /* If the average Q is higher than what was used in the last
+ * frame (after going through the recode loop to keep the frame
+ * size within range) then use the last frame value - 1. The -1
+             * is designed to stop Q, and hence the data rate, from
+             * progressively falling away during difficult sections, but at
+             * the same time reduce the number of iterations around the
+ * recode loop.
+ */
+ if (Q > cpi->ni_av_qi)
+ cpi->ni_av_qi = Q - 1;
+ }
+ }
+
+ /* Update the buffer level variable. */
+ /* Non-viewable frames are a special case and are treated as pure overhead. */
+ if ( !cm->show_frame )
+ cpi->bits_off_target -= cpi->projected_frame_size;
+ else
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+ /* Clip the buffer level to the maximum specified buffer size */
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+ /* Rolling monitors of whether we are over or underspending used to
+ * help regulate min and Max Q in two pass.
+ */
+ cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+ cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+ cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+ cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
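+    /* Each rolling monitor is a simple IIR filter: the short ones weight
+     * history 3/4 against 1/4 for the new value, the long ones 31/32 against
+     * 1/32; the +2 and +16 terms round to nearest.
+     */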
+
+ /* Actual bits spent */
+ cpi->total_actual_bits += cpi->projected_frame_size;
+
+ /* Debug stats */
+ cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+ cpi->buffer_level = cpi->bits_off_target;
+
+ /* Propagate values to higher temporal layers */
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ int bits_off_for_this_layer =
+ (int)(lc->target_bandwidth / lc->frame_rate -
+ cpi->projected_frame_size);
+
+ lc->bits_off_target += bits_off_for_this_layer;
+
+ /* Clip buffer level to maximum buffer size for the layer */
+ if (lc->bits_off_target > lc->maximum_buffer_size)
+ lc->bits_off_target = lc->maximum_buffer_size;
+
+ lc->total_actual_bits += cpi->projected_frame_size;
+ lc->total_target_vs_actual += bits_off_for_this_layer;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
+
+ /* Update bits left to the kf and gf groups to account for overshoot
+ * or undershoot on these frames
+ */
+ if (cm->frame_type == KEY_FRAME)
+ {
+ cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->twopass.kf_group_bits < 0)
+ cpi->twopass.kf_group_bits = 0 ;
+ }
+ else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+ {
+ cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+ if (cpi->twopass.gf_group_bits < 0)
+ cpi->twopass.gf_group_bits = 0 ;
+ }
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->last_skip_false_probs[2] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[2] = cm->base_qindex;
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ cpi->last_skip_false_probs[1] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[1] = cm->base_qindex;
+ }
+ else
+ {
+ cpi->last_skip_false_probs[0] = cpi->prob_skip_false;
+ cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+ /* update the baseline */
+ cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false;
+
+ }
+ }
+
+#if 0 && CONFIG_INTERNAL_STATS
+ {
+ FILE *f = fopen("tmp.stt", "a");
+
+ vp8_clear_system_state();
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+ "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%10.3f %8d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ cpi->buffer_level,
+ (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->active_best_quality, cpi->active_worst_quality,
+ cpi->ni_av_qi, cpi->cq_target_quality,
+ cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor,
+ (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ (double)cpi->twopass.bits_left /
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->tot_recode_hits);
+ else
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
+ "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
+ "%8d\n",
+ cpi->common.current_video_frame,
+ cpi->this_frame_target, cpi->projected_frame_size,
+ (cpi->projected_frame_size - cpi->this_frame_target),
+ (int)cpi->total_target_vs_actual,
+ cpi->buffer_level,
+ (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
+ (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->active_best_quality, cpi->active_worst_quality,
+ cpi->ni_av_qi, cpi->cq_target_quality,
+ cpi->zbin_over_quant,
+ cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
+ cm->frame_type, cpi->gfu_boost,
+ cpi->twopass.est_max_qcorrection_factor,
+ (int)cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->tot_recode_hits);
+
+ fclose(f);
+
+ {
+ FILE *fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ",
+ cpi->common.current_video_frame,
+ cm->frame_type, cm->refresh_golden_frame,
+ cm->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; i++)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+ }
+
+#endif
+
+ if (cm->refresh_golden_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+ if (cm->refresh_alt_ref_frame == 1)
+ cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+ else
+ cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+ if (cm->refresh_last_frame & cm->refresh_golden_frame)
+ /* both refreshed */
+ cpi->gold_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_golden_frame)
+ /* 1 refreshed but not the other */
+ cpi->gold_is_last = 0;
+
+ if (cm->refresh_last_frame & cm->refresh_alt_ref_frame)
+ /* both refreshed */
+ cpi->alt_is_last = 1;
+ else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame)
+ /* 1 refreshed but not the other */
+ cpi->alt_is_last = 0;
+
+ if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame)
+ /* both refreshed */
+ cpi->gold_is_alt = 1;
+ else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame)
+ /* 1 refreshed but not the other */
+ cpi->gold_is_alt = 0;
+
+ cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+ if (cpi->gold_is_last)
+ cpi->ref_frame_flags &= ~VP8_GOLD_FRAME;
+
+ if (cpi->alt_is_last)
+ cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
+
+ if (cpi->gold_is_alt)
+ cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
+
+
+ if (!cpi->oxcf.error_resilient_mode)
+ {
+ if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
+ /* Update the alternate reference frame stats as appropriate. */
+ update_alt_ref_frame_stats(cpi);
+ else
+ /* Update the Golden frame stats as appropriate. */
+ update_golden_frame_stats(cpi);
+ }
+
+ if (cm->frame_type == KEY_FRAME)
+ {
+ /* Tell the caller that the frame was coded as a key frame */
+ *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+ /* As this frame is a key frame the next defaults to an inter frame. */
+ cm->frame_type = INTER_FRAME;
+
+ cpi->last_frame_percent_intra = 100;
+ }
+ else
+ {
+ *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+ cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
+ }
+
+ /* Clear the one shot update flags for segmentation map and mode/ref
+ * loop filter deltas.
+ */
+ cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+ cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+ cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+
+    /* Don't increment frame counters if this was an altref buffer update,
+     * not a real frame
+     */
+ if (cm->show_frame)
+ {
+ cm->current_video_frame++;
+ cpi->frames_since_key++;
+ }
+
+ /* reset to normal state now that we are done. */
+
+
+
+#if 0
+ {
+ char filename[512];
+ FILE *recon_file;
+ sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+ recon_file = fopen(filename, "wb");
+ fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,
+ cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+#endif
+
+ /* DEBUG */
+ /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
+
+
+}
+
+
+static void check_gf_quality(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+ int gf_active_pct = (100 * cpi->gf_active_count) / (cm->mb_rows * cm->mb_cols);
+ int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
+ int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
+
+ /* Gf refresh is not currently being signalled */
+ if (cpi->gf_update_recommended == 0)
+ {
+ if (cpi->common.frames_since_golden > 7)
+ {
+ /* Low use of gf */
+ if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
+ {
+                /* ...but last frame zero-zero usage is reasonable, so a
+                 * new gf might be appropriate
+                 */
+ if (last_ref_zz_useage >= 25)
+ {
+ cpi->gf_bad_count ++;
+
+ /* Check that the condition is stable */
+ if (cpi->gf_bad_count >= 8)
+ {
+ cpi->gf_update_recommended = 1;
+ cpi->gf_bad_count = 0;
+ }
+ }
+ else
+ /* Restart count as the background is not stable enough */
+ cpi->gf_bad_count = 0;
+ }
+ else
+                /* GF usage has picked up so reset the count */
+ cpi->gf_bad_count = 0;
+ }
+ }
+    /* If the signal is set but has not been read, should we cancel it? */
+ else if (last_ref_zz_useage < 15)
+ {
+ cpi->gf_update_recommended = 0;
+ cpi->gf_bad_count = 0;
+ }
+
+#if 0
+ {
+ FILE *f = fopen("gfneeded.stt", "a");
+ fprintf(f, "%10d %10d %10d %10d %10ld \n",
+ cm->current_video_frame,
+ cpi->common.frames_since_golden,
+ gf_active_pct, gf_ref_usage_pct,
+ cpi->gf_update_recommended);
+ fclose(f);
+ }
+
+#endif
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned char * dest_end, unsigned int *frame_flags)
+{
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ vp8_second_pass(cpi);
+
+ encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
+ cpi->twopass.bits_left -= 8 * *size;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ {
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
+ *cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
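+        /* e.g. a 800000 bps target with two_pass_vbrmin_section of 5 credits
+         * 40000 bps back, i.e. 40000 / frame_rate bits returned to bits_left
+         * for every frame that is not an alt ref update.
+         */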
+ }
+}
+#endif
+
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
+#if HAVE_NEON
+extern void vp8_push_neon(int64_t *store);
+extern void vp8_pop_neon(int64_t *store);
+#endif
+
+
+int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
+{
+#if HAVE_NEON
+ int64_t store_reg[8];
+#endif
+ VP8_COMMON *cm = &cpi->common;
+ struct vpx_usec_timer timer;
+ int res = 0;
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(store_reg);
+ }
+#endif
+
+ vpx_usec_timer_start(&timer);
+
+ /* Reinit the lookahead buffer if the frame size changes */
+ if (sd->y_width != cpi->oxcf.Width || sd->y_height != cpi->oxcf.Height)
+ {
+ assert(cpi->oxcf.lag_in_frames < 2);
+ dealloc_raw_frame_buffers(cpi);
+ alloc_raw_frame_buffers(cpi);
+ }
+
+ if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+ frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL))
+ res = -1;
+ cm->clr_type = sd->clrtype;
+ vpx_usec_timer_mark(&timer);
+ cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
+#endif
+
+ return res;
+}
+
+
+static int frame_is_reference(const VP8_COMP *cpi)
+{
+ const VP8_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+ return cm->frame_type == KEY_FRAME || cm->refresh_last_frame
+ || cm->refresh_golden_frame || cm->refresh_alt_ref_frame
+ || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf
+ || cm->refresh_entropy_probs
+ || xd->mode_ref_lf_delta_update
+ || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;
+}
+
+
+int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
+{
+#if HAVE_NEON
+ int64_t store_reg[8];
+#endif
+ VP8_COMMON *cm;
+ struct vpx_usec_timer tsctimer;
+ struct vpx_usec_timer ticktimer;
+ struct vpx_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+
+ if (!cpi)
+ return -1;
+
+ cm = &cpi->common;
+
+ if (setjmp(cpi->common.error.jmp))
+ {
+ cpi->common.error.setjmp = 0;
+ return VPX_CODEC_CORRUPT_FRAME;
+ }
+
+ cpi->common.error.setjmp = 1;
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_push_neon(store_reg);
+ }
+#endif
+
+ vpx_usec_timer_start(&cmptimer);
+
+ cpi->source = NULL;
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Should we code an alternate reference frame */
+ if (cpi->oxcf.error_resilient_mode == 0 &&
+ cpi->oxcf.play_alternate &&
+ cpi->source_alt_ref_pending)
+ {
+ if ((cpi->source = vp8_lookahead_peek(cpi->lookahead,
+ cpi->frames_till_gf_update_due,
+ PEEK_FORWARD)))
+ {
+ cpi->alt_ref_source = cpi->source;
+ if (cpi->oxcf.arnr_max_frames > 0)
+ {
+ vp8_temporal_filter_prepare_c(cpi,
+ cpi->frames_till_gf_update_due);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+ cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+ cm->refresh_alt_ref_frame = 1;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ cm->show_frame = 0;
+ /* Clear Pending alt Ref flag. */
+ cpi->source_alt_ref_pending = 0;
+ cpi->is_src_frame_alt_ref = 0;
+ }
+ }
+#endif
+
+ if (!cpi->source)
+ {
+        /* Read the last frame's source if we are encoding the first pass. */
+ if (cpi->pass == 1 && cm->current_video_frame > 0)
+ {
+ if((cpi->last_source = vp8_lookahead_peek(cpi->lookahead, 1,
+ PEEK_BACKWARD)) == NULL)
+ return -1;
+ }
+
+
+ if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush)))
+ {
+ cm->show_frame = 1;
+
+ cpi->is_src_frame_alt_ref = cpi->alt_ref_source
+ && (cpi->source == cpi->alt_ref_source);
+
+ if(cpi->is_src_frame_alt_ref)
+ cpi->alt_ref_source = NULL;
+ }
+ }
+
+ if (cpi->source)
+ {
+ cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img;
+ cpi->un_scaled_source = cpi->Source;
+ *time_stamp = cpi->source->ts_start;
+ *time_end = cpi->source->ts_end;
+ *frame_flags = cpi->source->flags;
+
+ if (cpi->pass == 1 && cm->current_video_frame > 0)
+ {
+ cpi->last_frame_unscaled_source = &cpi->last_source->img;
+ }
+ }
+ else
+ {
+ *size = 0;
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done)
+ {
+ vp8_end_first_pass(cpi); /* get last stats packet */
+ cpi->twopass.first_pass_done = 1;
+ }
+
+#endif
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
+#endif
+ return -1;
+ }
+
+ if (cpi->source->ts_start < cpi->first_time_stamp_ever)
+ {
+ cpi->first_time_stamp_ever = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_start;
+ }
+
+ /* adjust frame rates based on timestamps given */
+ if (cm->show_frame)
+ {
+ int64_t this_duration;
+ int step = 0;
+
+ if (cpi->source->ts_start == cpi->first_time_stamp_ever)
+ {
+ this_duration = cpi->source->ts_end - cpi->source->ts_start;
+ step = 1;
+ }
+ else
+ {
+ int64_t last_duration;
+
+ this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
+ last_duration = cpi->last_end_time_stamp_seen
+ - cpi->last_time_stamp_seen;
+ /* do a step update if the duration changes by 10% */
+ if (last_duration)
+ step = (int)(((this_duration - last_duration) *
+ 10 / last_duration));
+ }
+
+ if (this_duration)
+ {
+ if (step)
+ cpi->ref_frame_rate = 10000000.0 / this_duration;
+ else
+ {
+ double avg_duration, interval;
+
+ /* Average this frame's rate into the last second's average
+ * frame rate. If we haven't seen 1 second yet, then average
+ * over the whole interval seen.
+ */
+ interval = (double)(cpi->source->ts_end -
+ cpi->first_time_stamp_ever);
+ if(interval > 10000000.0)
+ interval = 10000000;
+
+ avg_duration = 10000000.0 / cpi->ref_frame_rate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
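+                /* i.e. new_avg = old_avg * (interval - old_avg +
+                 * this_duration) / interval, which leaves the average
+                 * unchanged when this_duration matches it and otherwise
+                 * nudges it toward the new duration with a gain of roughly
+                 * old_avg / interval.
+                 */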
+
+ cpi->ref_frame_rate = 10000000.0 / avg_duration;
+ }
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ /* Update frame rates for each layer */
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->frame_rate = cpi->ref_frame_rate /
+ cpi->oxcf.rate_decimator[i];
+ }
+ }
+ else
+ vp8_new_frame_rate(cpi, cpi->ref_frame_rate);
+ }
+
+ cpi->last_time_stamp_seen = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+ }
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int layer;
+
+ update_layer_contexts (cpi);
+
+ /* Restore layer specific context & set frame rate */
+ layer = cpi->oxcf.layer_id[
+ cm->current_video_frame % cpi->oxcf.periodicity];
+ restore_layer_context (cpi, layer);
+ vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate);
+ }
+
+ if (cpi->compressor_speed == 2)
+ {
+ if (cpi->oxcf.number_of_layers == 1)
+ check_gf_quality(cpi);
+ vpx_usec_timer_start(&tsctimer);
+ vpx_usec_timer_start(&ticktimer);
+ }
+
+ cpi->lf_zeromv_pct = (cpi->zeromv_count * 100)/cm->MBs;
+
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ {
+ int i;
+ const int num_part = (1 << cm->multi_token_partition);
+ /* the available bytes in dest */
+ const unsigned long dest_size = dest_end - dest;
+ const int tok_part_buff_size = (dest_size * 9) / (10 * num_part);
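+        /* Layout example: with dest_size of 100000 bytes and 4 token
+         * partitions, the control partition is reserved 10000 bytes and
+         * each token partition (100000 * 9) / 40 = 22500 bytes.
+         */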
+
+ unsigned char *dp = dest;
+
+ cpi->partition_d[0] = dp;
+ dp += dest_size/10; /* reserve 1/10 for control partition */
+ cpi->partition_d_end[0] = dp;
+
+ for(i = 0; i < num_part; i++)
+ {
+ cpi->partition_d[i + 1] = dp;
+ dp += tok_part_buff_size;
+ cpi->partition_d_end[i + 1] = dp;
+ }
+ }
+#endif
+
+ /* start with a 0 size frame */
+ *size = 0;
+
+ /* Clear down mmx registers */
+ vp8_clear_system_state();
+
+ cm->frame_type = INTER_FRAME;
+ cm->frame_flags = *frame_flags;
+
+#if 0
+
+ if (cm->refresh_alt_ref_frame)
+ {
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 0;
+ }
+ else
+ {
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ }
+
+#endif
+ /* find a free buffer for the new frame */
+ {
+ int i = 0;
+ for(; i < NUM_YV12_BUFFERS; i++)
+ {
+ if(!cm->yv12_fb[i].flags)
+ {
+ cm->new_fb_idx = i;
+ break;
+ }
+ }
+
+ assert(i < NUM_YV12_BUFFERS );
+ }
+#if !(CONFIG_REALTIME_ONLY)
+
+ if (cpi->pass == 1)
+ {
+ Pass1Encode(cpi, size, dest, frame_flags);
+ }
+ else if (cpi->pass == 2)
+ {
+ Pass2Encode(cpi, size, dest, dest_end, frame_flags);
+ }
+ else
+#endif
+ encode_frame_to_data_rate(cpi, size, dest, dest_end, frame_flags);
+
+ if (cpi->compressor_speed == 2)
+ {
+ unsigned int duration, duration2;
+ vpx_usec_timer_mark(&tsctimer);
+ vpx_usec_timer_mark(&ticktimer);
+
+ duration = (int)(vpx_usec_timer_elapsed(&ticktimer));
+ duration2 = (unsigned int)((double)duration / 2);
+
+ if (cm->frame_type != KEY_FRAME)
+ {
+ if (cpi->avg_encode_time == 0)
+ cpi->avg_encode_time = duration;
+ else
+ cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3;
+ }
+
+        if (duration2)
+        {
+            if (cpi->avg_pick_mode_time == 0)
+                cpi->avg_pick_mode_time = duration2;
+            else
+                cpi->avg_pick_mode_time = (7 * cpi->avg_pick_mode_time + duration2) >> 3;
+        }
+
+ }
+
+ if (cm->refresh_entropy_probs == 0)
+ {
+ vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+ }
+
+ /* Save the contexts separately for alt ref, gold and last. */
+    /* TODO(jbb): optimize this with pointers to avoid extra copies. */
+ if(cm->refresh_alt_ref_frame)
+ vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+
+ if(cm->refresh_golden_frame)
+ vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+
+ if(cm->refresh_last_frame)
+ vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+
+    /* if it's a dropped frame, honor the requests on subsequent frames */
+ if (*size > 0)
+ {
+ cpi->droppable = !frame_is_reference(cpi);
+
+ /* return to normal state */
+ cm->refresh_entropy_probs = 1;
+ cm->refresh_alt_ref_frame = 0;
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->frame_type = INTER_FRAME;
+
+ }
+
+ /* Save layer specific state */
+ if (cpi->oxcf.number_of_layers > 1)
+ save_layer_context (cpi);
+
+ vpx_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+ {
+ generate_psnr_packet(cpi);
+ }
+
+#if CONFIG_INTERNAL_STATS
+
+ if (cpi->pass != 1)
+ {
+ cpi->bytes += *size;
+
+ if (cm->show_frame)
+ {
+
+ cpi->count ++;
+
+ if (cpi->b_calculate_psnr)
+ {
+ uint64_t ye,ue,ve;
+ double frame_psnr;
+ YV12_BUFFER_CONFIG *orig = cpi->Source;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ int y_samples = orig->y_height * orig->y_width ;
+ int uv_samples = orig->uv_height * orig->uv_width ;
+ int t_samples = y_samples + 2 * uv_samples;
+ double sq_error, sq_error2;
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+
+ sq_error = (double)(ye + ue + ve);
+
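+                /* vp8_mse2psnr() converts a summed squared error over N
+                 * samples with peak value P to PSNR, presumably the usual
+                 * PSNR = 10 * log10(P * P * N / SSE).
+                 */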
+ frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+ cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
+ cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
+ cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
+ cpi->total_sq_error += sq_error;
+ cpi->total += frame_psnr;
+#if CONFIG_POSTPROC
+ {
+ YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ double frame_psnr2, frame_ssim2 = 0;
+ double weight = 0;
+
+ vp8_deblock(cm, cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0);
+ vp8_clear_system_state();
+
+ ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+ pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height);
+
+ ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+ pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+
+ ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+ pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+
+ sq_error2 = (double)(ye + ue + ve);
+
+ frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
+
+ cpi->totalp_y += vp8_mse2psnr(y_samples,
+ 255.0, (double)ye);
+ cpi->totalp_u += vp8_mse2psnr(uv_samples,
+ 255.0, (double)ue);
+ cpi->totalp_v += vp8_mse2psnr(uv_samples,
+ 255.0, (double)ve);
+ cpi->total_sq_error2 += sq_error2;
+ cpi->totalp += frame_psnr2;
+
+ frame_ssim2 = vp8_calc_ssim(cpi->Source,
+ &cm->post_proc_buffer, 1, &weight);
+
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
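+                    /* The overall SSIM figure reported later is presumably
+                     * the weight-normalised mean:
+                     * summed_quality / summed_weights.
+                     */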
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ for (i=cpi->current_layer;
+ i<cpi->oxcf.number_of_layers; i++)
+ {
+ cpi->frames_in_layer[i]++;
+
+ cpi->bytes_in_layer[i] += *size;
+ cpi->sum_psnr[i] += frame_psnr;
+ cpi->sum_psnr_p[i] += frame_psnr2;
+ cpi->total_error2[i] += sq_error;
+ cpi->total_error2_p[i] += sq_error2;
+ cpi->sum_ssim[i] += frame_ssim2 * weight;
+ cpi->sum_weights[i] += weight;
+ }
+ }
+ }
+#endif
+ }
+
+ if (cpi->b_calculate_ssimg)
+ {
+ double y, u, v, frame_all;
+ frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show,
+ &y, &u, &v);
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ unsigned int i;
+
+ for (i=cpi->current_layer;
+ i<cpi->oxcf.number_of_layers; i++)
+ {
+ if (!cpi->b_calculate_psnr)
+ cpi->frames_in_layer[i]++;
+
+ cpi->total_ssimg_y_in_layer[i] += y;
+ cpi->total_ssimg_u_in_layer[i] += u;
+ cpi->total_ssimg_v_in_layer[i] += v;
+ cpi->total_ssimg_all_in_layer[i] += frame_all;
+ }
+ }
+ else
+ {
+ cpi->total_ssimg_y += y;
+ cpi->total_ssimg_u += u;
+ cpi->total_ssimg_v += v;
+ cpi->total_ssimg_all += frame_all;
+ }
+ }
+
+ }
+ }
+
+#if 0
+
+ if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q)
+ {
+ skiptruecount += cpi->skip_true_count;
+ skipfalsecount += cpi->skip_false_count;
+ }
+
+#endif
+#if 0
+
+ if (cpi->pass != 1)
+ {
+ FILE *f = fopen("skip.stt", "a");
+ fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size);
+
+ if (cpi->is_src_frame_alt_ref == 1)
+ fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size);
+
+ fclose(f);
+ }
+
+#endif
+#endif
+
+#if HAVE_NEON
+#if CONFIG_RUNTIME_CPU_DETECT
+ if (cm->cpu_caps & HAS_NEON)
+#endif
+ {
+ vp8_pop_neon(store_reg);
+ }
+#endif
+
+ cpi->common.error.setjmp = 0;
+
+ return 0;
+}
+
+int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags)
+{
+ if (cpi->common.refresh_alt_ref_frame)
+ return -1;
+ else
+ {
+ int ret;
+
+#if CONFIG_MULTITHREAD
+ if(cpi->b_lpf_running)
+ {
+ sem_wait(&cpi->h_event_end_lpf);
+ cpi->b_lpf_running = 0;
+ }
+#endif
+
+#if CONFIG_POSTPROC
+ ret = vp8_post_proc_frame(&cpi->common, dest, flags);
+#else
+
+ if (cpi->common.frame_to_show)
+ {
+ *dest = *cpi->common.frame_to_show;
+ dest->y_width = cpi->common.Width;
+ dest->y_height = cpi->common.Height;
+ dest->uv_height = cpi->common.Height / 2;
+ ret = 0;
+ }
+ else
+ {
+ ret = -1;
+ }
+
+#endif
+ vp8_clear_system_state();
+ return ret;
+ }
+}
+
+int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
+{
+ signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int internal_delta_q[MAX_MB_SEGMENTS];
+ const int range = 63;
+ int i;
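+    /* The external (user-level) quantizer scale runs 0-63; q_trans[] maps
+     * it onto the internal 0-127 qindex scale, which is why the deltas are
+     * range checked against +/-63 before being translated below.
+     */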
+
+ // This method is currently incompatible with the cyclic refresh method
+ if ( cpi->cyclic_refresh_mode_enabled )
+ return -1;
+
+ // Check number of rows and columns match
+ if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+ return -1;
+
+ // Range check the delta Q values and convert the external Q range values
+ // to internal ones.
+ if ( (abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
+ (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range) )
+ return -1;
+
+ // Range check the delta lf values
+ if ( (abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
+ (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range) )
+ return -1;
+
+ if (!map)
+ {
+ disable_segmentation(cpi);
+ return 0;
+ }
+
+ // Translate the external delta q values to internal values.
+ for ( i = 0; i < MAX_MB_SEGMENTS; i++ )
+ internal_delta_q[i] =
+ ( delta_q[i] >= 0 ) ? q_trans[delta_q[i]] : -q_trans[-delta_q[i]];
+
+ /* Set the segmentation Map */
+ set_segmentation_map(cpi, map);
+
+ /* Activate segmentation. */
+ enable_segmentation(cpi);
+
+ /* Set up the quant segment data */
+ feature_data[MB_LVL_ALT_Q][0] = internal_delta_q[0];
+ feature_data[MB_LVL_ALT_Q][1] = internal_delta_q[1];
+ feature_data[MB_LVL_ALT_Q][2] = internal_delta_q[2];
+ feature_data[MB_LVL_ALT_Q][3] = internal_delta_q[3];
+
+    /* Set up the loop filter segment data */
+ feature_data[MB_LVL_ALT_LF][0] = delta_lf[0];
+ feature_data[MB_LVL_ALT_LF][1] = delta_lf[1];
+ feature_data[MB_LVL_ALT_LF][2] = delta_lf[2];
+ feature_data[MB_LVL_ALT_LF][3] = delta_lf[3];
+
+ cpi->segment_encode_breakout[0] = threshold[0];
+ cpi->segment_encode_breakout[1] = threshold[1];
+ cpi->segment_encode_breakout[2] = threshold[2];
+ cpi->segment_encode_breakout[3] = threshold[3];
+
+ /* Initialise the feature data structure */
+ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+ return 0;
+}
+
+int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols)
+{
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+ {
+ if (map)
+ {
+ vpx_memcpy(cpi->active_map, map, rows * cols);
+ cpi->active_map_enabled = 1;
+ }
+ else
+ cpi->active_map_enabled = 0;
+
+ return 0;
+ }
+ else
+ {
+        return -1;
+ }
+}
+
+int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
+{
+ if (horiz_mode <= ONETWO)
+ cpi->common.horiz_scale = horiz_mode;
+ else
+ return -1;
+
+ if (vert_mode <= ONETWO)
+ cpi->common.vert_scale = vert_mode;
+ else
+ return -1;
+
+ return 0;
+}
+
+
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
+{
+ int i, j;
+ int Total = 0;
+
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+    /* Loop through the Y plane, summing the squared differences between
+     * the raw and reconstructed data over 16x16 blocks.
+     */
+ for (i = 0; i < source->y_height; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
+
+
+int vp8_get_quantizer(VP8_COMP *cpi)
+{
+ return cpi->common.base_qindex;
+}
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
new file mode 100644
index 0000000..ed9c762
--- /dev/null
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -0,0 +1,741 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_INT_H
+#define __INC_VP8_INT_H
+
+#include <stdio.h>
+#include "vpx_config.h"
+#include "vp8/common/onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/variance.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp8/common/entropy.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/mem.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8.h"
+#include "mcomp.h"
+#include "vp8/common/findnearmv.h"
+#include "lookahead.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "vp8/encoder/denoising.h"
+#endif
+
+#define MIN_GF_INTERVAL 4
+#define DEFAULT_GF_INTERVAL 7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25)
+
+#define AF_THRESH 25
+#define AF_THRESH2 100
+#define ARF_DECAY_THRESH 12
+#define MAX_MODES 20
+
+#define MIN_THRESHMULT 32
+#define MAX_THRESHMULT 512
+
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
+#define MV_ZBIN_BOOST 4
+#define ZBIN_OQ_MAX 192
+
+#if !(CONFIG_REALTIME_ONLY)
+#define VP8_TEMPORAL_ALT_REF 1
+#endif
+
+#define MAX(x,y) (((x)>(y))?(x):(y))
+#define MIN(x,y) (((x)<(y))?(x):(y))
+
+typedef struct
+{
+ int kf_indicated;
+ unsigned int frames_since_key;
+ unsigned int frames_since_golden;
+ int filter_level;
+ int frames_till_gf_update_due;
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+ MV_CONTEXT mvc[2];
+ int mvcosts[2][MVvals+1];
+
+#ifdef MODE_STATS
+ int y_modes[5];
+ int uv_modes[4];
+ int b_modes[10];
+ int inter_y_modes[10];
+ int inter_uv_modes[4];
+ int inter_b_modes[10];
+#endif
+
+ vp8_prob ymode_prob[4], uv_mode_prob[3]; /* interframe intra mode probs */
+    vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3]; /* keyframe intra mode probs */
+
+ int ymode_count[5], uv_mode_count[4]; /* intra MB type cts this frame */
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+ int this_frame_percent_intra;
+ int last_frame_percent_intra;
+
+
+} CODING_CONTEXT;
+
+typedef struct
+{
+ double frame;
+ double intra_error;
+ double coded_error;
+ double ssim_weighted_pred_err;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct
+{
+ int frames_so_far;
+ double frame_intra_error;
+ double frame_coded_error;
+ double frame_pcnt_inter;
+ double frame_pcnt_motion;
+ double frame_mvr;
+ double frame_mvr_abs;
+ double frame_mvc;
+ double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+
+typedef enum
+{
+ THR_ZERO1 = 0,
+ THR_DC = 1,
+
+ THR_NEAREST1 = 2,
+ THR_NEAR1 = 3,
+
+ THR_ZERO2 = 4,
+ THR_NEAREST2 = 5,
+
+ THR_ZERO3 = 6,
+ THR_NEAREST3 = 7,
+
+ THR_NEAR2 = 8,
+ THR_NEAR3 = 9,
+
+ THR_V_PRED = 10,
+ THR_H_PRED = 11,
+ THR_TM = 12,
+
+ THR_NEW1 = 13,
+ THR_NEW2 = 14,
+ THR_NEW3 = 15,
+
+ THR_SPLIT1 = 16,
+ THR_SPLIT2 = 17,
+ THR_SPLIT3 = 18,
+
+ THR_B_PRED = 19
+}
+THR_MODES;
+
+typedef enum
+{
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2
+} SEARCH_METHODS;
+
+typedef struct
+{
+ int RD;
+ SEARCH_METHODS search_method;
+ int improved_quant;
+ int improved_dct;
+ int auto_filter;
+ int recode_loop;
+ int iterative_sub_pixel;
+ int half_pixel_search;
+ int quarter_pixel_search;
+ int thresh_mult[MAX_MODES];
+ int max_step_search_steps;
+ int first_step;
+ int optimize_coefficients;
+
+ int use_fastquant_for_pick;
+ int no_skip_block4x4_search;
+ int improved_mv_pred;
+
+} SPEED_FEATURES;
+
+typedef struct
+{
+ MACROBLOCK mb;
+ int segment_counts[MAX_MB_SEGMENTS];
+ int totalrate;
+} MB_ROW_COMP;
+
+typedef struct
+{
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+ void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct
+{
+ int ithread;
+ void *ptr1;
+} LPFTHREAD_DATA;
+
+enum
+{
+ BLOCK_16X8,
+ BLOCK_8X16,
+ BLOCK_8X8,
+ BLOCK_4X4,
+ BLOCK_16X16,
+ BLOCK_MAX_SEGMENTS
+};
+
+typedef struct
+{
+ /* Layer configuration */
+ double frame_rate;
+ int target_bandwidth;
+
+ /* Layer specific coding parameters */
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+ int64_t starting_buffer_level_in_ms;
+ int64_t optimal_buffer_level_in_ms;
+ int64_t maximum_buffer_size_in_ms;
+
+ int avg_frame_size_for_layer;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+
+ int64_t total_actual_bits;
+ int total_target_vs_actual;
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ int zbin_over_quant;
+
+ int inter_frame_target;
+ int64_t total_byte_count;
+
+ int filter_level;
+
+ int last_frame_percent_intra;
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+} LAYER_CONTEXT;
+
+typedef struct VP8_COMP
+{
+
+ DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);
+
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y1quant_fast[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, Y2quant_fast[QINDEX_RANGE][16]);
+ DECLARE_ALIGNED(16, short, UVquant_fast[QINDEX_RANGE][16]);
+
+
+ MACROBLOCK mb;
+ VP8_COMMON common;
+ vp8_writer bc[9]; /* one boolcoder for each partition */
+
+ VP8_CONFIG oxcf;
+
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *source;
+ struct lookahead_entry *alt_ref_source;
+ struct lookahead_entry *last_source;
+
+ YV12_BUFFER_CONFIG *Source;
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *last_frame_unscaled_source;
+
+ /* frame in src_buffers has been identified to be encoded as an alt ref */
+ int source_alt_ref_pending;
+ /* an alt ref frame has been encoded and is usable */
+ int source_alt_ref_active;
+ /* source of frame to encode is an exact copy of an alt ref frame */
+ int is_src_frame_alt_ref;
+
+    /* golden frame same as last frame (short circuit gold searches) */
+    int gold_is_last;
+    /* alt reference frame same as last (short circuit altref search) */
+    int alt_is_last;
+    /* don't do both alt and gold search (just do gold). */
+ int gold_is_alt;
+
+ YV12_BUFFER_CONFIG pick_lf_lvl_frame;
+
+ TOKENEXTRA *tok;
+ unsigned int tok_count;
+
+
+ unsigned int frames_since_key;
+ unsigned int key_frame_frequency;
+ unsigned int this_key_frame_forced;
+ unsigned int next_key_frame_forced;
+
+    /* Ambient reconstruction err target for forced key frames */
+ int ambient_err;
+
+ unsigned int mode_check_freq[MAX_MODES];
+ unsigned int mode_test_hit_counts[MAX_MODES];
+ unsigned int mode_chosen_counts[MAX_MODES];
+ unsigned int mbs_tested_so_far;
+
+ int rd_thresh_mult[MAX_MODES];
+ int rd_baseline_thresh[MAX_MODES];
+ int rd_threshes[MAX_MODES];
+
+ int RDMULT;
+    int RDDIV;
+
+ CODING_CONTEXT coding_context;
+
+    /* Rate targeting variables */
+ int64_t last_prediction_error;
+ int64_t last_intra_error;
+
+ int this_frame_target;
+ int projected_frame_size;
+ int last_q[2]; /* Separate values for Intra/Inter */
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ /* Count down till next GF */
+ int frames_till_gf_update_due;
+
+ /* GF interval chosen when we coded the last GF */
+ int current_gf_interval;
+
+    /* Total bits overspent because of GF boost (cumulative) */
+ int gf_overspend_bits;
+
+ /* Used in the few frames following a GF to recover the extra bits
+ * spent in that GF
+ */
+ int non_gf_bitrate_adjustment;
+
+ /* Extra bits spent on key frames that need to be recovered */
+ int kf_overspend_bits;
+
+    /* Current number of bits to try to recover on each inter frame. */
+ int kf_bitrate_adjustment;
+ int max_gf_interval;
+ int baseline_gf_interval;
+ int active_arnr_frames;
+
+ int64_t key_frame_count;
+ int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+ /* Current section per frame bandwidth target */
+ int per_frame_bandwidth;
+ /* Average frame size target for clip */
+ int av_per_frame_bandwidth;
+ /* Minimum allocation that should be used for any frame */
+ int min_frame_bandwidth;
+ int inter_frame_target;
+ double output_frame_rate;
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+
+ int zbin_over_quant;
+ int zbin_mode_boost;
+ int zbin_mode_boost_enabled;
+ int last_zbin_over_quant;
+ int last_zbin_mode_boost;
+
+ int64_t total_byte_count;
+
+ int buffered_mode;
+
+ double frame_rate;
+ double ref_frame_rate;
+ int64_t buffer_level;
+ int64_t bits_off_target;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int64_t total_actual_bits;
+ int total_target_vs_actual; /* debug stats */
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int cq_target_quality;
+
+ int drop_frames_allowed; /* Are we permitted to drop frames? */
+ int drop_frame; /* Drop this frame? */
+
+ vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+ char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+
+ unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+
+ int gfu_boost;
+ int kf_boost;
+ int last_boost;
+
+ int target_bandwidth;
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+#if 0
+ /* Experimental code for lagged and one pass */
+ ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+ int one_pass_frame_index;
+#endif
+
+ int decimation_factor;
+ int decimation_count;
+
+ /* for real time encoding */
+ int avg_encode_time; /* microsecond */
+ int avg_pick_mode_time; /* microsecond */
+ int Speed;
+ int compressor_speed;
+
+ int interquantizer;
+ int auto_gold;
+ int auto_adjust_gold_quantizer;
+ int auto_worst_q;
+ int cpu_used;
+ int pass;
+
+
+ int prob_intra_coded;
+ int prob_last_coded;
+ int prob_gf_coded;
+ int prob_skip_false;
+ int last_skip_false_probs[3];
+ int last_skip_probs_q[3];
+ int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+ int this_frame_percent_intra;
+ int last_frame_percent_intra;
+
+ int ref_frame_flags;
+
+ SPEED_FEATURES sf;
+ int error_bins[1024];
+
+    /* Data used in real-time conferencing mode to help determine whether
+     * it would be good to update the GF.
+     */
+ int inter_zz_count;
+ /* Count ZEROMV on all reference frames. */
+ int zeromv_count;
+ int lf_zeromv_pct;
+ int gf_bad_count;
+ int gf_update_recommended;
+
+ unsigned char *segmentation_map;
+ signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+ int segment_encode_breakout[MAX_MB_SEGMENTS];
+
+ unsigned char *active_map;
+ unsigned int active_map_enabled;
+
+    /* Video conferencing cyclic refresh mode flags. This is a mode
+     * designed to clean up the background over time in live encoding
+     * scenarios. It uses segmentation.
+     */
+ int cyclic_refresh_mode_enabled;
+ int cyclic_refresh_mode_max_mbs_perframe;
+ int cyclic_refresh_mode_index;
+ int cyclic_refresh_q;
+ signed char *cyclic_refresh_map;
+
+#if CONFIG_MULTITHREAD
+ /* multithread data */
+ int * mt_current_mb_col;
+ int mt_sync_range;
+ int b_multi_threaded;
+ int encoding_thread_count;
+ int b_lpf_running;
+
+ pthread_t *h_encoding_thread;
+ pthread_t h_filter_thread;
+
+ MB_ROW_COMP *mb_row_ei;
+ ENCODETHREAD_DATA *en_thread_data;
+ LPFTHREAD_DATA lpf_thread_data;
+
+ /* events */
+ sem_t *h_event_start_encoding;
+ sem_t h_event_end_encoding;
+ sem_t h_event_start_lpf;
+ sem_t h_event_end_lpf;
+#endif
+
+ TOKENLIST *tplist;
+ unsigned int partition_sz[MAX_PARTITIONS];
+ unsigned char *partition_d[MAX_PARTITIONS];
+ unsigned char *partition_d_end[MAX_PARTITIONS];
+
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ vp8_full_search_fn_t full_search_sad;
+ vp8_refining_search_fn_t refining_search_sad;
+ vp8_diamond_search_fn_t diamond_search_sad;
+ vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_mb_row;
+
+ int base_skip_false_prob[128];
+
+ FRAME_CONTEXT lfc_n; /* last frame entropy */
+ FRAME_CONTEXT lfc_a; /* last alt ref entropy */
+ FRAME_CONTEXT lfc_g; /* last gold ref entropy */
+
+
+ struct twopass_rc
+ {
+ unsigned int section_intra_rating;
+ double section_max_qfactor;
+ unsigned int next_iiratio;
+ unsigned int this_iiratio;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ int64_t bits_left;
+ int64_t clip_bits_total;
+ double avg_iiratio;
+ double modified_error_total;
+ double modified_error_used;
+ double modified_error_left;
+ double kf_intra_err_min;
+ double gf_intra_err_min;
+ int frames_to_key;
+ int maxq_max_limit;
+ int maxq_min_limit;
+ int gf_decay_rate;
+ int static_scene_max_gf_interval;
+ int kf_bits;
+ /* Remaining error from uncoded frames in a gf group. */
+ int gf_group_error_left;
+ /* Projected total bits available for a key frame group of frames */
+ int64_t kf_group_bits;
+ /* Error score of frames still to be coded in kf group */
+ int64_t kf_group_error_left;
+ /* Projected Bits available for a group including 1 GF or ARF */
+ int gf_group_bits;
+ /* Bits for the golden frame or ARF */
+ int gf_bits;
+ int alt_extra_bits;
+ double est_max_qcorrection_factor;
+ } twopass;
+
+#if VP8_TEMPORAL_ALT_REF
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ int fixed_divide[512];
+#endif
+
+#if CONFIG_INTERNAL_STATS
+ int count;
+ double total_y;
+ double total_u;
+ double total_v;
+    double total;
+ double total_sq_error;
+ double totalp_y;
+ double totalp_u;
+ double totalp_v;
+ double totalp;
+ double total_sq_error2;
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+
+
+ double total_ssimg_y;
+ double total_ssimg_u;
+ double total_ssimg_v;
+ double total_ssimg_all;
+
+ int b_calculate_ssimg;
+#endif
+ int b_calculate_psnr;
+
+ /* Per MB activity measurement */
+ unsigned int activity_avg;
+ unsigned int * mb_activity_map;
+
+ /* Record of which MBs still refer to last golden frame either
+ * directly or through 0,0
+ */
+ unsigned char *gf_active_flags;
+ int gf_active_count;
+
+ int output_partition;
+
+ /* Store last frame's MV info for next frame MV prediction */
+ int_mv *lfmv;
+ int *lf_ref_frame_sign_bias;
+ int *lf_ref_frame;
+
+ /* force next frame to intra when kf_auto says so */
+ int force_next_frame_intra;
+
+ int droppable;
+
+#if CONFIG_TEMPORAL_DENOISING
+ VP8_DENOISER denoiser;
+#endif
+
+ /* Coding layer state variables */
+ unsigned int current_layer;
+ LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS];
+
+ int64_t frames_in_layer[VPX_TS_MAX_LAYERS];
+ int64_t bytes_in_layer[VPX_TS_MAX_LAYERS];
+ double sum_psnr[VPX_TS_MAX_LAYERS];
+ double sum_psnr_p[VPX_TS_MAX_LAYERS];
+ double total_error2[VPX_TS_MAX_LAYERS];
+ double total_error2_p[VPX_TS_MAX_LAYERS];
+ double sum_ssim[VPX_TS_MAX_LAYERS];
+ double sum_weights[VPX_TS_MAX_LAYERS];
+
+ double total_ssimg_y_in_layer[VPX_TS_MAX_LAYERS];
+ double total_ssimg_u_in_layer[VPX_TS_MAX_LAYERS];
+ double total_ssimg_v_in_layer[VPX_TS_MAX_LAYERS];
+ double total_ssimg_all_in_layer[VPX_TS_MAX_LAYERS];
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* Number of MBs per row at lower-resolution level */
+ int mr_low_res_mb_cols;
+ /* Indicate if lower-res mv info is available */
+ unsigned char mr_low_res_mv_avail;
+ /* The frame number of each reference frames */
+ unsigned int current_ref_frames[MAX_REF_FRAMES];
+#endif
+
+ struct rd_costs_struct
+ {
+ int mvcosts[2][MVvals+1];
+ int mvsadcosts[2][MVfpvals+1];
+ int mbmode_cost[2][MB_MODE_COUNT];
+ int intra_uv_mode_cost[2][MB_MODE_COUNT];
+ int bmode_costs[10][10][10];
+ int inter_bmode_costs[B_MODE_COUNT];
+ int token_costs[BLOCK_TYPES][COEF_BANDS]
+ [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+ } rd_costs;
+} VP8_COMP;
+
+void control_data_rate(VP8_COMP *cpi);
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char *dest_end, unsigned long *size);
+
+int rd_cost_intra_mb(MACROBLOCKD *x);
+
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **);
+
+void vp8_set_speed_features(VP8_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval" at %s:%d", \
+ __FILE__,__LINE__);\
+ } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+ lval = (expr); \
+ if(!lval) \
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+ "Failed to allocate "#lval);\
+ } while(0)
+#endif
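+/* Hypothetical usage sketch:
+ *     CHECK_MEM_ERROR(cpi->tplist, vpx_memalign(32, sz));
+ * Note the expansion assumes a VP8_COMP *cpi is in scope for error
+ * reporting.
+ */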
+#endif
diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c
new file mode 100644
index 0000000..3f09a9f
--- /dev/null
+++ b/libvpx/vp8/encoder/pickinter.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_config.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "vp8/common/entropymode.h"
+#include "pickinter.h"
+#include "vp8/common/findnearmv.h"
+#include "encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
+
+extern int VP8_UVSSE(MACROBLOCK *x);
+
+#ifdef SPEEDSTATS
+extern unsigned int cnt_pm;
+#endif
+
+extern const int vp8_ref_frame_order[MAX_MODES];
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+
+
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp8_variance_fn_ptr_t *vfp,
+ int *mvcost[2], int *distortion,
+ unsigned int *sse)
+{
+ (void) b;
+ (void) d;
+ (void) ref_mv;
+ (void) error_per_bit;
+ (void) vfp;
+ (void) mvcost;
+ (void) distortion;
+ (void) sse;
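+    /* No sub-pel refinement in this stub: just scale the full-pel MV up to
+     * 1/8-pel units.
+     */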
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+ return 0;
+}
+
+
+int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+ const vp8_variance_fn_ptr_t *vfp,
+ unsigned int *sse,
+ int_mv this_mv)
+{
+
+ BLOCK *b = &mb->block[0];
+ BLOCKD *d = &mb->e_mbd.block[0];
+ unsigned char *what = (*(b->base_src) + b->src);
+ int what_stride = b->src_stride;
+ int pre_stride = mb->e_mbd.pre.y_stride;
+    unsigned char *in_what = mb->e_mbd.pre.y_buffer + d->offset;
+ int in_what_stride = pre_stride;
+ int xoffset = this_mv.as_mv.col & 7;
+ int yoffset = this_mv.as_mv.row & 7;
+
+ in_what += (this_mv.as_mv.row >> 3) * pre_stride + (this_mv.as_mv.col >> 3);
+
+ if (xoffset | yoffset)
+ {
+ return vfp->svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
+ }
+ else
+ {
+ return vfp->vf(what, what_stride, in_what, in_what_stride, sse);
+ }
+
+}
+
+
+unsigned int vp8_get4x4sse_cs_c
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride
+)
+{
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int diff = src_ptr[c] - ref_ptr[c];
+ distortion += diff * diff;
+ }
+
+ src_ptr += source_stride;
+ ref_ptr += recon_stride;
+ }
+
+ return distortion;
+}
+
+static int get_prediction_error(BLOCK *be, BLOCKD *b)
+{
+ unsigned char *sptr;
+ unsigned char *dptr;
+ sptr = (*(be->base_src) + be->src);
+ dptr = b->predictor;
+
+ return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
+
+}
+
+static int pick_intra4x4block(
+ MACROBLOCK *x,
+ int ib,
+ B_PREDICTION_MODE *best_mode,
+ const int *mode_costs,
+
+ int *bestrate,
+ int *bestdistortion)
+{
+
+ BLOCKD *b = &x->e_mbd.block[ib];
+ BLOCK *be = &x->block[ib];
+ int dst_stride = x->e_mbd.dst.y_stride;
+ unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+ B_PREDICTION_MODE mode;
+ int best_rd = INT_MAX;
+ int rate;
+ int distortion;
+
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ unsigned char top_left = Above[-1];
+
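+    /* Only the first few 4x4 modes (up to B_HE_PRED) are evaluated here,
+     * apparently as a speed tradeoff in this non-RD pick path; the mode
+     * with the lowest RD cost wins.
+     */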
+ for (mode = B_DC_PRED; mode <= B_HE_PRED; mode++)
+ {
+ int this_rd;
+
+ rate = mode_costs[mode];
+
+ vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+ b->predictor, 16, top_left);
+ distortion = get_prediction_error(be, b);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ *bestrate = rate;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ }
+ }
+
+ b->bmi.as_mode = *best_mode;
+ vp8_encode_intra4x4block(x, ib);
+ return best_rd;
+}
+
+
+static int pick_intra4x4mby_modes
+(
+ MACROBLOCK *mb,
+ int *Rate,
+ int *best_dist
+)
+{
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int error;
+ int distortion = 0;
+ const int *bmode_costs;
+
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+ bmode_costs = mb->inter_bmode_costs;
+
+ for (i = 0; i < 16; i++)
+ {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);
+
+ if (mb->e_mbd.frame_type == KEY_FRAME)
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+ bmode_costs = mb->bmode_costs[A][L];
+ }
+
+
+ pick_intra4x4block(mb, i, &best_mode, bmode_costs, &r, &d);
+
+ cost += r;
+ distortion += d;
+ mic->bmi[i].as_mode = best_mode;
+
+        /* Break out when we have already exceeded the best-so-far value
+         * that was passed in.
+         */
+ if (distortion > *best_dist)
+ break;
+ }
+
+ *Rate = cost;
+
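+    /* i == 16 means every 4x4 block was searched without hitting the
+     * distortion breakout above, so the result is valid.
+     */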
+ if (i == 16)
+ {
+ *best_dist = distortion;
+ error = RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+ }
+ else
+ {
+ *best_dist = INT_MAX;
+ error = INT_MAX;
+ }
+
+ return error;
+}
+
+static void pick_intra_mbuv_mode(MACROBLOCK *mb)
+{
+
+ MACROBLOCKD *x = &mb->e_mbd;
+ unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+ unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+ unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src);
+ unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src);
+ int uvsrc_stride = mb->block[16].src_stride;
+ unsigned char uleft_col[8];
+ unsigned char vleft_col[8];
+ unsigned char utop_left = uabove_row[-1];
+ unsigned char vtop_left = vabove_row[-1];
+ int i, j;
+ int expected_udc;
+ int expected_vdc;
+ int shift;
+ int Uaverage = 0;
+ int Vaverage = 0;
+ int diff;
+ int pred_error[4] = {0, 0, 0, 0}, best_error = INT_MAX;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+
+
+ for (i = 0; i < 8; i++)
+ {
+ uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+ vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+ }
+
+ if (!x->up_available && !x->left_available)
+ {
+ expected_udc = 128;
+ expected_vdc = 128;
+ }
+ else
+ {
+ shift = 2;
+
+ if (x->up_available)
+ {
+
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uabove_row[i];
+ Vaverage += vabove_row[i];
+ }
+
+ shift ++;
+
+ }
+
+ if (x->left_available)
+ {
+ for (i = 0; i < 8; i++)
+ {
+ Uaverage += uleft_col[i];
+ Vaverage += vleft_col[i];
+ }
+
+ shift ++;
+
+ }
+
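+        /* Rounded average of the available neighbours: shift is 3 when one
+         * edge (8 samples) is available and 4 when both (16 samples) are.
+         */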
+ expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+ expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+ }
+
+
+ for (i = 0; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++)
+ {
+
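+            /* TM prediction: left + above - top_left, clamped to [0, 255]
+             * below.
+             */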
+ int predu = uleft_col[i] + uabove_row[j] - utop_left;
+ int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+ int u_p, v_p;
+
+ u_p = usrc_ptr[j];
+ v_p = vsrc_ptr[j];
+
+ if (predu < 0)
+ predu = 0;
+
+ if (predu > 255)
+ predu = 255;
+
+ if (predv < 0)
+ predv = 0;
+
+ if (predv > 255)
+ predv = 255;
+
+
+ diff = u_p - expected_udc;
+ pred_error[DC_PRED] += diff * diff;
+ diff = v_p - expected_vdc;
+ pred_error[DC_PRED] += diff * diff;
+
+
+ diff = u_p - uabove_row[j];
+ pred_error[V_PRED] += diff * diff;
+ diff = v_p - vabove_row[j];
+ pred_error[V_PRED] += diff * diff;
+
+
+ diff = u_p - uleft_col[i];
+ pred_error[H_PRED] += diff * diff;
+ diff = v_p - vleft_col[i];
+ pred_error[H_PRED] += diff * diff;
+
+
+ diff = u_p - predu;
+ pred_error[TM_PRED] += diff * diff;
+ diff = v_p - predv;
+ pred_error[TM_PRED] += diff * diff;
+
+
+ }
+
+ usrc_ptr += uvsrc_stride;
+ vsrc_ptr += uvsrc_stride;
+
+ if (i == 3)
+ {
+ usrc_ptr = (mb->block[18].src + *mb->block[18].base_src);
+ vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src);
+ }
+
+
+
+ }
+
+
+ for (i = DC_PRED; i <= TM_PRED; i++)
+ {
+ if (best_error > pred_error[i])
+ {
+ best_error = pred_error[i];
+ best_mode = (MB_PREDICTION_MODE)i;
+ }
+ }
+
+
+ mb->e_mbd.mode_info_context->mbmi.uv_mode = best_mode;
+
+}
+
+static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+    /* Split MV modes are currently not supported when RD is not enabled;
+     * therefore, we only need to update MVcount in NEWMV mode. */
+ if (xd->mode_info_context->mbmi.mode == NEWMV)
+ {
+ x->MVcount[0][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.row -
+ best_ref_mv->as_mv.row) >> 1)]++;
+ x->MVcount[1][mv_max+((xd->mode_info_context->mbmi.mv.as_mv.col -
+ best_ref_mv->as_mv.col) >> 1)]++;
+ }
+}
+
+
+#if CONFIG_MULTI_RES_ENCODING
+static
+void get_lower_res_motion_info(VP8_COMP *cpi, MACROBLOCKD *xd, int *dissim,
+ int *parent_ref_frame,
+ MB_PREDICTION_MODE *parent_mode,
+ int_mv *parent_ref_mv, int mb_row, int mb_col)
+{
+ LOWER_RES_MB_INFO* store_mode_info
+ = ((LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info)->mb_info;
+ unsigned int parent_mb_index;
+
+ /* Consider different down_sampling_factor. */
+ {
+ /* TODO: Removed the loop that supports special down_sampling_factor
+ * such as 2, 4, 8. Will revisit it if needed.
+ * Should also try using a look-up table to see if it helps
+ * performance. */
+ int parent_mb_row, parent_mb_col;
+
+ parent_mb_row = mb_row*cpi->oxcf.mr_down_sampling_factor.den
+ /cpi->oxcf.mr_down_sampling_factor.num;
+ parent_mb_col = mb_col*cpi->oxcf.mr_down_sampling_factor.den
+ /cpi->oxcf.mr_down_sampling_factor.num;
+ parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col;
+ }
+
+ /* Read lower-resolution mode & motion result from memory.*/
+ *parent_ref_frame = store_mode_info[parent_mb_index].ref_frame;
+ *parent_mode = store_mode_info[parent_mb_index].mode;
+ *dissim = store_mode_info[parent_mb_index].dissim;
+
+    /* For the highest-resolution encoder, halve the dissim value, trading
+     * a little quality for performance. */
+ if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1))
+ *dissim>>=1;
+
+ if(*parent_ref_frame != INTRA_FRAME)
+ {
+ /* Consider different down_sampling_factor.
+ * The result can be rounded to be more precise, but it takes more time.
+ */
+ (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row
+ *cpi->oxcf.mr_down_sampling_factor.num
+ /cpi->oxcf.mr_down_sampling_factor.den;
+ (*parent_ref_mv).as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col
+ *cpi->oxcf.mr_down_sampling_factor.num
+ /cpi->oxcf.mr_down_sampling_factor.den;
+
+ vp8_clamp_mv2(parent_ref_mv, xd);
+ }
+}
+#endif
+
+static void check_for_encode_breakout(unsigned int sse, MACROBLOCK* x)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ unsigned int threshold = (xd->block[0].dequant[1]
+ * xd->block[0].dequant[1] >>4);
+
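+    /* The default threshold scales with the square of the first AC
+     * dequantizer step; the user-configured encode_breakout then acts as a
+     * floor.
+     */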
+ if(threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ if (sse < threshold )
+ {
+ /* Check u and v to make sure skip is ok */
+ unsigned int sse2 = 0;
+
+ sse2 = VP8_UVSSE(x);
+
+ if (sse2 * 2 < x->encode_breakout)
+ x->skip = 1;
+ else
+ x->skip = 0;
+ }
+}
+
+static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
+ VP8_COMP *cpi, MACROBLOCK *x, int rd_adj)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ int_mv mv = x->e_mbd.mode_info_context->mbmi.mv;
+ int this_rd;
+ /* Exit early and don't compute the distortion if this macroblock
+ * is marked inactive. */
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+ {
+ *sse = 0;
+ *distortion2 = 0;
+ x->skip = 1;
+ return INT_MAX;
+ }
+
+ if((this_mode != NEWMV) ||
+ !(cpi->sf.half_pixel_search) || cpi->common.full_pixel==1)
+ *distortion2 = vp8_get_inter_mbpred_error(x,
+ &cpi->fn_ptr[BLOCK_16X16],
+ sse, mv);
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2);
+
+ /* Adjust rd to bias to ZEROMV */
+ if(this_mode == ZEROMV)
+ {
+ /* Bias to ZEROMV on LAST_FRAME reference when it is available. */
+ if ((cpi->ref_frame_flags & VP8_LAST_FRAME &
+ cpi->common.refresh_last_frame)
+ && x->e_mbd.mode_info_context->mbmi.ref_frame != LAST_FRAME)
+ rd_adj = 100;
+
+            /* rd_adj is at most 100, so this scaling can only lower the RD cost */
+ this_rd = ((int64_t)this_rd) * rd_adj / 100;
+ }
+
+ check_for_encode_breakout(*sse, x);
+ return this_rd;
+}
+
+static void calculate_zeromv_rd_adjustment(VP8_COMP *cpi, MACROBLOCK *x,
+ int *rd_adjustment)
+{
+ MODE_INFO *mic = x->e_mbd.mode_info_context;
+ int_mv mv_l, mv_a, mv_al;
+ int local_motion_check = 0;
+
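+    /* MVs are stored in 1/8-pel units, so abs(...) < 8 below means less
+     * than one full pixel of motion in a neighbouring macroblock.
+     */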
+ if (cpi->lf_zeromv_pct > 40)
+ {
+ /* left mb */
+ mic -= 1;
+ mv_l = mic->mbmi.mv;
+
+ if (mic->mbmi.ref_frame != INTRA_FRAME)
+ if( abs(mv_l.as_mv.row) < 8 && abs(mv_l.as_mv.col) < 8)
+ local_motion_check++;
+
+ /* above-left mb */
+ mic -= x->e_mbd.mode_info_stride;
+ mv_al = mic->mbmi.mv;
+
+ if (mic->mbmi.ref_frame != INTRA_FRAME)
+ if( abs(mv_al.as_mv.row) < 8 && abs(mv_al.as_mv.col) < 8)
+ local_motion_check++;
+
+ /* above mb */
+ mic += 1;
+ mv_a = mic->mbmi.mv;
+
+ if (mic->mbmi.ref_frame != INTRA_FRAME)
+ if( abs(mv_a.as_mv.row) < 8 && abs(mv_a.as_mv.col) < 8)
+ local_motion_check++;
+
+ if (((!x->e_mbd.mb_to_top_edge || !x->e_mbd.mb_to_left_edge)
+ && local_motion_check >0) || local_motion_check >2 )
+ *rd_adjustment = 80;
+ else if (local_motion_check > 0)
+ *rd_adjustment = 90;
+ }
+}
+
+void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra, int mb_row,
+ int mb_col)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO best_mbmode;
+
+ int_mv best_ref_mv_sb[2];
+ int_mv mode_mv_sb[2][MB_MODE_COUNT];
+ int_mv best_ref_mv;
+ int_mv *mode_mv;
+ MB_PREDICTION_MODE this_mode;
+ int num00;
+ int mdcounts[4];
+ int best_rd = INT_MAX;
+ int rd_adjustment = 100;
+ int best_intra_rd = INT_MAX;
+ int mode_index;
+ int rate;
+ int rate2;
+ int distortion2;
+ int bestsme = INT_MAX;
+ int best_mode_index = 0;
+ unsigned int sse = INT_MAX, best_rd_sse = INT_MAX;
+#if CONFIG_TEMPORAL_DENOISING
+ unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX;
+#endif
+
+ int_mv mvp;
+
+ int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ int saddone=0;
+    /* Search range obtained from mv_pred(), expressed in step_param levels (0-7) */
+ int sr=0;
+
+ unsigned char *plane[4][3];
+ int ref_frame_map[4];
+ int sign_bias = 0;
+
+#if CONFIG_MULTI_RES_ENCODING
+ int dissim = INT_MAX;
+ int parent_ref_frame = 0;
+ int parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
+ int_mv parent_ref_mv;
+ MB_PREDICTION_MODE parent_mode = 0;
+
+ if (parent_ref_valid)
+ {
+ int parent_ref_flag;
+
+ get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame,
+ &parent_mode, &parent_ref_mv, mb_row, mb_col);
+
+ /* TODO(jkoleszar): The references available (ref_frame_flags) to the
+ * lower res encoder should match those available to this encoder, but
+ * there seems to be a situation where this mismatch can happen in the
+ * case of frame dropping and temporal layers. For example,
+ * GOLD being disallowed in ref_frame_flags, but being returned as
+ * parent_ref_frame.
+ *
+ * In this event, take the conservative approach of disabling the
+ * lower res info for this MB.
+ */
+ parent_ref_flag = 0;
+ if (parent_ref_frame == LAST_FRAME)
+ parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME);
+ else if (parent_ref_frame == GOLDEN_FRAME)
+ parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME);
+ else if (parent_ref_frame == ALTREF_FRAME)
+ parent_ref_flag = (cpi->ref_frame_flags & VP8_ALTR_FRAME);
+
+ //assert(!parent_ref_frame || parent_ref_flag);
+ if (parent_ref_frame && !parent_ref_flag)
+ parent_ref_valid = 0;
+ }
+#endif
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = 0;
+ vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+
+ /* Setup search priorities */
+#if CONFIG_MULTI_RES_ENCODING
+ if (parent_ref_valid && parent_ref_frame && dissim < 8)
+ {
+ ref_frame_map[0] = -1;
+ ref_frame_map[1] = parent_ref_frame;
+ ref_frame_map[2] = -1;
+ ref_frame_map[3] = -1;
+ } else
+#endif
+ get_reference_search_order(cpi, ref_frame_map);
+
+ /* Check to see if there is at least 1 valid reference frame that we need
+ * to calculate near_mvs.
+ */
+ if (ref_frame_map[1] > 0)
+ {
+ sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ mode_mv_sb,
+ best_ref_mv_sb,
+ mdcounts,
+ ref_frame_map[1],
+ cpi->common.ref_frame_sign_bias);
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+
+ get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+ /* Count of the number of MBs tested so far this frame */
+ cpi->mbs_tested_so_far++;
+
+ *returnintra = INT_MAX;
+ x->skip = 0;
+
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+    /* If the frame has a big static background and the current MB is in a
+     * low-motion area, its mode decision is biased toward ZEROMV.
+     */
+ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+
+    /* If we encode a new MV, this is important:
+     * find the best new motion vector.
+     */
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+ {
+ int frame_cost;
+ int this_rd = INT_MAX;
+ int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
+
+ if (best_rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+ if (this_ref_frame < 0)
+ continue;
+
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+ /* everything but intra */
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+ {
+ sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (parent_ref_valid)
+ {
+ if (vp8_mode_order[mode_index] == NEARESTMV &&
+ mode_mv[NEARESTMV].as_int ==0)
+ continue;
+ if (vp8_mode_order[mode_index] == NEARMV &&
+ mode_mv[NEARMV].as_int ==0)
+ continue;
+
+ if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV
+ && best_ref_mv.as_int==0)
+ continue;
+ else if(vp8_mode_order[mode_index] == NEWMV && dissim==0
+ && best_ref_mv.as_int==parent_ref_mv.as_int)
+ continue;
+ }
+#endif
+ }
+
+        /* Check to see if the testing frequency for this mode is at its
+         * max. If so, prevent it from being tested and increase the
+         * threshold for its testing. */
+ if (cpi->mode_test_hit_counts[mode_index] &&
+ (cpi->mode_check_freq[mode_index] > 1))
+ {
+ if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] *
+ cpi->mode_test_hit_counts[mode_index]))
+ {
+ /* Increase the threshold for coding this mode to make it less
+ * likely to be chosen */
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ continue;
+ }
+ }
+
+        /* We have now reached the point where we are going to test the
+         * current mode, so increment the counter for the number of times
+         * it has been tested. */
+ cpi->mode_test_hit_counts[mode_index] ++;
+
+ rate2 = 0;
+ distortion2 = 0;
+
+ this_mode = vp8_mode_order[mode_index];
+
+ x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+
+        /* Work out the cost associated with selecting the reference frame */
+ frame_cost =
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+ rate2 += frame_cost;
+
+        /* Only consider ZEROMV/ALTREF_FRAME for the alt ref frame,
+         * unless ARNR filtering is enabled, in which case we want
+         * an unfiltered alternative */
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ {
+ if (this_mode != ZEROMV ||
+ x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+ continue;
+ }
+
+ switch (this_mode)
+ {
+ case B_PRED:
+ /* Pass best so far to pick_intra4x4mby_modes to use as breakout */
+ distortion2 = best_rd_sse;
+ pick_intra4x4mby_modes(x, &rate, &distortion2);
+
+ if (distortion2 == INT_MAX)
+ {
+ this_rd = INT_MAX;
+ }
+ else
+ {
+ rate2 += rate;
+ distortion2 = vp8_variance16x16(
+ *(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < best_intra_rd)
+ {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+ }
+
+ break;
+
+ case SPLITMV:
+
+ /* Split MV modes currently not supported when RD is not enabled. */
+ break;
+
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+ distortion2 = vp8_variance16x16
+ (*(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+ rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < best_intra_rd)
+ {
+ best_intra_rd = this_rd;
+ *returnintra = distortion2;
+ }
+ break;
+
+ case NEWMV:
+ {
+ int thissme;
+ int step_param;
+ int further_steps;
+ int n = 0;
+ int sadpb = x->sadperbit16;
+ int_mv mvp_full;
+
+ int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv.as_mv.col>>3)
+ + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv.as_mv.row>>3)
+ + MAX_FULL_PEL_VAL;
+
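+            /* The full-pel search window above is +/-MAX_FULL_PEL_VAL
+             * around best_ref_mv; (v + 7) >> 3 rounds the 1/8-pel value up
+             * to full pels for the minimum, v >> 3 rounds down for the
+             * maximum, keeping the window conservative.
+             */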
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
+
+ /* Further step/diamond searches as necessary */
+ step_param = cpi->sf.first_step + speed_adjust;
+
+#if CONFIG_MULTI_RES_ENCODING
+            /* If the lower-res encoder drops this frame, the higher-res
+               encoder does its motion search without any prior knowledge.
+               Also, since the last frame's motion info is not stored, we
+               cannot use improved_mv_pred. */
+ if (cpi->oxcf.mr_encoder_id && !parent_ref_valid)
+ cpi->sf.improved_mv_pred = 0;
+
+ if (parent_ref_valid && parent_ref_frame)
+ {
+ /* Use parent MV as predictor. Adjust search range
+ * accordingly.
+ */
+ mvp.as_int = parent_ref_mv.as_int;
+ mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3;
+ mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3;
+
+ if(dissim <=32) step_param += 3;
+ else if(dissim <=128) step_param += 2;
+ else step_param += 1;
+ }else
+#endif
+ {
+ if(cpi->sf.improved_mv_pred)
+ {
+ if(!saddone)
+ {
+ vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+ saddone = 1;
+ }
+
+ vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context,
+ &mvp,x->e_mbd.mode_info_context->mbmi.ref_frame,
+ cpi->common.ref_frame_sign_bias, &sr,
+ &near_sadidx[0]);
+
+ sr += speed_adjust;
+ /* adjust search range according to sr from mv prediction */
+ if(sr > step_param)
+ step_param = sr;
+
+ mvp_full.as_mv.col = mvp.as_mv.col>>3;
+ mvp_full.as_mv.row = mvp.as_mv.row>>3;
+ }else
+ {
+ mvp.as_int = best_ref_mv.as_int;
+ mvp_full.as_mv.col = best_ref_mv.as_mv.col>>3;
+ mvp_full.as_mv.row = best_ref_mv.as_mv.row>>3;
+ }
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ if (parent_ref_valid && parent_ref_frame && dissim <= 2 &&
+ MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
+ abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
+ {
+ d->bmi.mv.as_int = mvp_full.as_int;
+ mode_mv[NEWMV].as_int = mvp_full.as_int;
+
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ cpi->mb.mvcost,
+ &distortion2,&sse);
+ }else
+#endif
+ {
+ /* Get intersection of UMV window and valid MV window to
+ * reduce # of checks in diamond search. */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ further_steps = (cpi->Speed >= 8)?
+ 0: (cpi->sf.max_step_search_steps - 1 - step_param);
+
+ if (cpi->sf.search_method == HEX)
+ {
+#if CONFIG_MULTI_RES_ENCODING
+                    /* TODO: In higher-res pick_inter_mode, step_param is
+                     * used to modify the hex search range. Here, set
+                     * step_param to 0 so as not to change the behavior of
+                     * the lowest-resolution encoder. Will improve it later.
+                     */
+                    /* Set step_param to 0 to ensure a large-range motion
+                     * search when the encoder drops this frame at a lower
+                     * resolution.
+                     */
+ if (!parent_ref_valid)
+ step_param = 0;
+#endif
+ bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
+ step_param, sadpb,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvsadcost, x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv, step_param, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* Further step/diamond searches as necessary */
+ n = num00;
+ num00 = 0;
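+                    /* num00 appears to report how many of the following
+                     * step sizes the initial search already found centred
+                     * (no improvement), so those iterations are skipped.
+                     */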
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme =
+ cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv,
+ step_param + n,
+ sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+ }
+ }
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX)
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv,
+ &best_ref_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ cpi->mb.mvcost,
+ &distortion2,&sse);
+ }
+
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* mv cost; */
+ rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+ cpi->mb.mvcost, 128);
+ }
+
+ case NEARESTMV:
+ case NEARMV:
+
+ if (mode_mv[this_mode].as_int == 0)
+ continue;
+
+ case ZEROMV:
+
+            /* Trap vectors that reach beyond the UMV borders.
+             * Note that ALL NEWMV, NEARESTMV, NEARMV and ZEROMV code drops
+             * through to this point because of the lack of break statements
+             * in the previous two cases.
+             */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
+ ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ continue;
+
+ rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+ x->e_mbd.mode_info_context->mbmi.mv.as_int =
+ mode_mv[this_mode].as_int;
+ this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x,
+ rd_adjustment);
+
+ break;
+ default:
+ break;
+ }
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+
+ /* Store for later use by denoiser. */
+ if (this_mode == ZEROMV && sse < zero_mv_sse )
+ {
+ zero_mv_sse = sse;
+ x->best_zeromv_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+
+ /* Store the best NEWMV in x for later use in the denoiser. */
+ if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+ sse < best_sse)
+ {
+ best_sse = sse;
+ x->best_sse_inter_mode = NEWMV;
+ x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+ x->need_to_clamp_best_mvs =
+ x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+ x->best_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+ }
+#endif
+
+ if (this_rd < best_rd || x->skip)
+ {
+ /* Note index of best mode */
+ best_mode_index = mode_index;
+
+ *returnrate = rate2;
+ *returndistortion = distortion2;
+ best_rd_sse = sse;
+ best_rd = this_rd;
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
+
+            /* Testing this mode gave rise to an improvement in the best
+             * error score. Lower the threshold a bit for next time.
+             */
+ cpi->rd_thresh_mult[mode_index] =
+ (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+ cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ }
+
+        /* If the mode did not help improve the best error case, then raise the
+ * threshold for testing that mode next time around.
+ */
+ else
+ {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] =
+ (cpi->rd_baseline_thresh[mode_index] >> 7) *
+ cpi->rd_thresh_mult[mode_index];
+ }
+
+ if (x->skip)
+ break;
+ }
+
+ /* Reduce the activation RD thresholds for the best choice mode */
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
+
+ cpi->rd_thresh_mult[best_mode_index] =
+ (cpi->rd_thresh_mult[best_mode_index]
+ >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[best_mode_index] - best_adjustment :
+ MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] =
+ (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
+ cpi->rd_thresh_mult[best_mode_index];
+ }
+
+
+ {
+ int this_rdbin = (*returndistortion >> 7);
+
+ if (this_rdbin >= 1024)
+ {
+ this_rdbin = 1023;
+ }
+
+ cpi->error_bins[this_rdbin] ++;
+ }
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (x->best_sse_inter_mode == DC_PRED)
+ {
+ /* No best MV found. */
+ x->best_sse_inter_mode = best_mbmode.mode;
+ x->best_sse_mv = best_mbmode.mv;
+ x->need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
+ x->best_reference_frame = best_mbmode.ref_frame;
+ best_sse = best_rd_sse;
+ }
+ vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+ recon_yoffset, recon_uvoffset);
+
+
+ /* Reevaluate ZEROMV after denoising. */
+ if (best_mbmode.ref_frame == INTRA_FRAME &&
+ x->best_zeromv_reference_frame != INTRA_FRAME)
+ {
+ int this_rd = 0;
+ int this_ref_frame = x->best_zeromv_reference_frame;
+ rate2 = x->ref_frame_cost[this_ref_frame] +
+ vp8_cost_mv_ref(ZEROMV, mdcounts);
+ distortion2 = 0;
+
+ /* set up the proper prediction buffers for the frame */
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ this_rd = evaluate_inter_mode(&sse, rate2, &distortion2, cpi, x,
+ rd_adjustment);
+
+ if (this_rd < best_rd)
+ {
+ vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
+ }
+ }
+
+ }
+#endif
+
+ if (cpi->is_src_frame_alt_ref &&
+ (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
+ (cpi->common.mb_no_coeff_skip);
+ x->e_mbd.mode_info_context->mbmi.partitioning = 0;
+
+ return;
+ }
+
+ /* Set to the best mb mode; this copy can be skipped if x->skip is set,
+ * since it already has the right content. */
+ if (!x->skip)
+ vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+ sizeof(MB_MODE_INFO));
+
+ if (best_mbmode.mode <= B_PRED)
+ {
+ /* set mode_info_context->mbmi.uv_mode */
+ pick_intra_mbuv_mode(x);
+ }
+
+ if (sign_bias
+ != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+ best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
+
+ update_mvcount(cpi, x, &best_ref_mv);
+}
+
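+/* An illustrative sketch (not part of this patch; disabled below) of the
+ * per-mode threshold adaptation performed in the loop above: a mode that
+ * improves the best RD score gets a small subtractive decrease, a losing
+ * mode gets an additive increase, both clamped to [MIN_THRESHMULT,
+ * MAX_THRESHMULT]. The active threshold is then
+ * (rd_baseline_thresh >> 7) * mult.
+ */
+#if 0
+static int adapt_thresh_mult(int mult, int mode_improved_best_rd)
+{
+ if (mode_improved_best_rd)
+ mult = (mult >= MIN_THRESHMULT + 2) ? mult - 2 : MIN_THRESHMULT;
+ else
+ mult = (mult + 4 > MAX_THRESHMULT) ? MAX_THRESHMULT : mult + 4;
+ return mult;
+}
+#endif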
+
+void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
+{
+ int error4x4, error16x16 = INT_MAX;
+ int rate, best_rate = 0, distortion, best_sse;
+ MB_PREDICTION_MODE mode, best_mode = DC_PRED;
+ int this_rd;
+ unsigned int sse;
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ pick_intra_mbuv_mode(x);
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode ++)
+ {
+ xd->mode_info_context->mbmi.mode = mode;
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+ distortion = vp8_variance16x16
+ (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
+ rate = x->mbmode_cost[xd->frame_type][mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (error16x16 > this_rd)
+ {
+ error16x16 = this_rd;
+ best_mode = mode;
+ best_sse = sse;
+ best_rate = rate;
+ }
+ }
+ xd->mode_info_context->mbmi.mode = best_mode;
+
+ error4x4 = pick_intra4x4mby_modes(x, &rate,
+ &best_sse);
+ if (error4x4 < error16x16)
+ {
+ xd->mode_info_context->mbmi.mode = B_PRED;
+ best_rate = rate;
+ }
+
+ *rate_ = best_rate;
+}
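+/* Note on the selection above: RDCOST folds rate and distortion into one
+ * Lagrangian cost J = lambda * rate + distortion, with lambda derived from
+ * x->rdmult and x->rddiv, so a mode that is cheaper to signal can win even
+ * at slightly higher prediction variance.
+ */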
diff --git a/libvpx/vp8/encoder/pickinter.h b/libvpx/vp8/encoder/pickinter.h
new file mode 100644
index 0000000..35011ca
--- /dev/null
+++ b/libvpx/vp8/encoder/pickinter.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PICKINTER_H
+#define __INC_PICKINTER_H
+#include "vpx_config.h"
+#include "vp8/common/onyxc_int.h"
+
+extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra,
+ int mb_row, int mb_col);
+extern void vp8_pick_intra_mode(MACROBLOCK *x, int *rate);
+
+extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
+ const vp8_variance_fn_ptr_t *vfp,
+ unsigned int *sse,
+ int_mv this_mv);
+#endif
diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c
new file mode 100644
index 0000000..4121349
--- /dev/null
+++ b/libvpx/vp8/encoder/picklpf.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/loopfilter.h"
+#if ARCH_ARM
+#include "vpx_ports/arm.h"
+#endif
+
+extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
+
+void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc)
+{
+ unsigned char *src_y, *dst_y;
+ int yheight;
+ int ystride;
+ int yoffset;
+ int linestocopy;
+
+ yheight = src_ybc->y_height;
+ ystride = src_ybc->y_stride;
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+ /* Copy an extra 4 lines so that full filter context is available if
+ * filtering is done on the copied partial frame rather than the original.
+ * The partial filter does mb filtering for the top row as well, which can
+ * modify 3 pixels above.
+ */
+ linestocopy += 4;
+ /* partial image starts at ~middle of frame (macroblock border) */
+ yoffset = ystride * (((yheight >> 5) * 16) - 4);
+ src_y = src_ybc->y_buffer + yoffset;
+ dst_y = dst_ybc->y_buffer + yoffset;
+
+ vpx_memcpy(dst_y, src_y, ystride * linestocopy);
+}
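+/* Worked example of the sizing above (assuming PARTIAL_FRAME_FRACTION is 8,
+ * as in the common loop filter headers): for a 720-line frame,
+ * yheight >> 4 = 45 MB rows, 45 / 8 = 5 rows, 5 << 4 = 80 lines, plus the
+ * 4 context lines = 84 lines copied, starting at
+ * yoffset = ystride * ((720 >> 5) * 16 - 4) = ystride * 348, just above
+ * the frame's vertical midpoint.
+ */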
+
+static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest)
+{
+ int i, j;
+ int Total = 0;
+ int srcoffset, dstoffset;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ int linestocopy;
+
+ /* number of MB rows to use in partial filtering */
+ linestocopy = (source->y_height >> 4) / PARTIAL_FRAME_FRACTION;
+ linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
+
+
+ /* partial image starts at ~middle of frame (macroblock border) */
+ srcoffset = source->y_stride * ((dest->y_height >> 5) * 16);
+ dstoffset = dest->y_stride * ((dest->y_height >> 5) * 16);
+
+ src += srcoffset;
+ dst += dstoffset;
+
+ /* Loop through the Y plane raw and reconstruction data summing the
+ * squared differences.
+ */
+ for (i = 0; i < linestocopy; i += 16)
+ {
+ for (j = 0; j < source->y_width; j += 16)
+ {
+ unsigned int sse;
+ Total += vp8_mse16x16(src + j, source->y_stride,
+ dst + j, dest->y_stride,
+ &sse);
+ }
+
+ src += 16 * source->y_stride;
+ dst += 16 * dest->y_stride;
+ }
+
+ return Total;
+}
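+/* Plain-C model of the per-block accumulation above (illustrative only;
+ * the real vp8_mse16x16 is an optimized kernel with the same contract):
+ */
+#if 0
+static unsigned int mse16x16_model(const unsigned char *a, int a_stride,
+ const unsigned char *b, int b_stride)
+{
+ unsigned int sse = 0;
+ int r, c;
+
+ for (r = 0; r < 16; r++)
+ for (c = 0; c < 16; c++)
+ {
+ int d = a[r * a_stride + c] - b[r * b_stride + c];
+ sse += (unsigned int)(d * d); /* sum of squared differences */
+ }
+
+ return sse;
+}
+#endif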
+
+/* Enforce a minimum filter level based upon baseline Q */
+static int get_min_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+ int min_filter_level;
+
+ if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame &&
+ !cpi->common.refresh_alt_ref_frame)
+ min_filter_level = 0;
+ else
+ {
+ if (base_qindex <= 6)
+ min_filter_level = 0;
+ else if (base_qindex <= 16)
+ min_filter_level = 1;
+ else
+ min_filter_level = (base_qindex / 8);
+ }
+
+ return min_filter_level;
+}
+
+/* Enforce a maximum filter level based upon baseline Q */
+static int get_max_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+ /* PGW August 2006: Highest filter values are almost always a bad idea */
+
+ /* jbb chg: 20100118 - no longer true with this overquant stuff; allow
+ * high values when lots of intra is coming in.
+ */
+ int max_filter_level = MAX_LOOP_FILTER;
+ (void)base_qindex;
+
+ if (cpi->twopass.section_intra_rating > 8)
+ max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+ return max_filter_level;
+}
+
+void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+ int filt_val;
+ int best_filt_val = cm->filter_level;
+ YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+ /* Replace unfiltered frame buffer with a new one */
+ cm->frame_to_show = &cpi->pick_lf_lvl_frame;
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ if (cm->sharpness_level != cm->last_sharpness_level)
+ {
+ vp8_loop_filter_update_sharpness(&cm->lf_info, cm->sharpness_level);
+ cm->last_sharpness_level = cm->sharpness_level;
+ }
+
+ /* Start the search at the previous frame filter level unless it is
+ * now out of range.
+ */
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+ else if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+
+ filt_val = cm->filter_level;
+ best_filt_val = filt_val;
+
+ /* Get the err using the previous frame's filter value. */
+
+ /* Copy the unfiltered / processed recon buffer to the new buffer */
+ vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ best_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+ filt_val -= 1 + (filt_val > 10);
+
+ /* Search lower filter levels */
+ while (filt_val >= min_filter_level)
+ {
+ /* Apply the loop filter */
+ vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ /* Get the err for filtered frame */
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+ /* Update the best case record or exit loop. */
+ if (filt_err < best_err)
+ {
+ best_err = filt_err;
+ best_filt_val = filt_val;
+ }
+ else
+ break;
+
+ /* Adjust filter level */
+ filt_val -= 1 + (filt_val > 10);
+ }
+
+ /* Search up (note that we have already done filt_val = cm->filter_level) */
+ filt_val = cm->filter_level + 1 + (filt_val > 10);
+
+ if (best_filt_val == cm->filter_level)
+ {
+ /* Resist raising filter level for very small gains */
+ best_err -= (best_err >> 10);
+
+ while (filt_val < max_filter_level)
+ {
+ /* Apply the loop filter */
+ vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
+
+ vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
+
+ /* Get the err for filtered frame */
+ filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
+
+ /* Update the best case record or exit loop. */
+ if (filt_err < best_err)
+ {
+ /* Do not raise filter level if improvement is < 1 part
+ * in 1024 (filt_err >> 10)
+ */
+ best_err = filt_err - (filt_err >> 10);
+
+ best_filt_val = filt_val;
+ }
+ else
+ break;
+
+ /* Adjust filter level */
+ filt_val += 1 + (filt_val > 10);
+ }
+ }
+
+ cm->filter_level = best_filt_val;
+
+ if (cm->filter_level < min_filter_level)
+ cm->filter_level = min_filter_level;
+
+ if (cm->filter_level > max_filter_level)
+ cm->filter_level = max_filter_level;
+
+ /* restore unfiltered frame pointer */
+ cm->frame_to_show = saved_frame;
+}
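+/* The search above steps by 1 + (filt_val > 10): single steps through the
+ * low filter levels and steps of 2 once the level exceeds 10, where
+ * adjacent levels produce very similar results.
+ */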
+
+/* Stub function for now; Alt LF is not used */
+void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val)
+{
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ (void) filt_val;
+
+ mbd->segment_feature_data[MB_LVL_ALT_LF][0] = cpi->segment_feature_data[MB_LVL_ALT_LF][0];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][1] = cpi->segment_feature_data[MB_LVL_ALT_LF][1];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][2] = cpi->segment_feature_data[MB_LVL_ALT_LF][2];
+ mbd->segment_feature_data[MB_LVL_ALT_LF][3] = cpi->segment_feature_data[MB_LVL_ALT_LF][3];
+}
+
+void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ int best_err = 0;
+ int filt_err = 0;
+ int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+ int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+ int filter_step;
+ int filt_high = 0;
+ /* Start search at previous frame filter level */
+ int filt_mid = cm->filter_level;
+ int filt_low = 0;
+ int filt_best;
+ int filt_direction = 0;
+
+ /* Bias against raising loop filter and in favor of lowering it */
+ int Bias = 0;
+
+ int ss_err[MAX_LOOP_FILTER + 1];
+
+ YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
+
+ vpx_memset(ss_err, 0, sizeof(ss_err));
+
+ /* Replace unfiltered frame buffer with a new one */
+ cm->frame_to_show = &cpi->pick_lf_lvl_frame;
+
+ if (cm->frame_type == KEY_FRAME)
+ cm->sharpness_level = 0;
+ else
+ cm->sharpness_level = cpi->oxcf.Sharpness;
+
+ /* Start the search at the previous frame filter level unless it is
+ * now out of range.
+ */
+ filt_mid = cm->filter_level;
+
+ if (filt_mid < min_filter_level)
+ filt_mid = min_filter_level;
+ else if (filt_mid > max_filter_level)
+ filt_mid = max_filter_level;
+
+ /* Define the initial step size */
+ filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+
+ /* Get baseline error score */
+
+ /* Copy the unfiltered / processed recon buffer to the new buffer */
+ vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+
+ vp8cx_set_alt_lf_level(cpi, filt_mid);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+
+ best_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+
+ ss_err[filt_mid] = best_err;
+
+ filt_best = filt_mid;
+
+ while (filter_step > 0)
+ {
+ Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+ if (cpi->twopass.section_intra_rating < 20)
+ Bias = Bias * cpi->twopass.section_intra_rating / 20;
+
+ filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+ filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+ if ((filt_direction <= 0) && (filt_low != filt_mid))
+ {
+ if(ss_err[filt_low] == 0)
+ {
+ /* Get Low filter error score */
+ vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+ vp8cx_set_alt_lf_level(cpi, filt_low);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+ ss_err[filt_low] = filt_err;
+ }
+ else
+ filt_err = ss_err[filt_low];
+
+ /* If value is close to the best so far then bias towards a
+ * lower loop filter value.
+ */
+ if ((filt_err - Bias) < best_err)
+ {
+ /* Was it actually better than the previous best? */
+ if (filt_err < best_err)
+ best_err = filt_err;
+
+ filt_best = filt_low;
+ }
+ }
+
+ /* Now look at filt_high */
+ if ((filt_direction >= 0) && (filt_high != filt_mid))
+ {
+ if(ss_err[filt_high] == 0)
+ {
+ vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
+ vp8cx_set_alt_lf_level(cpi, filt_high);
+ vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+
+ filt_err = vp8_calc_ss_err(sd, cm->frame_to_show);
+ ss_err[filt_high] = filt_err;
+ }
+ else
+ filt_err = ss_err[filt_high];
+
+ /* Was it better than the previous best? */
+ if (filt_err < (best_err - Bias))
+ {
+ best_err = filt_err;
+ filt_best = filt_high;
+ }
+ }
+
+ /* Half the step distance if the best filter value was the same
+ * as last time
+ */
+ if (filt_best == filt_mid)
+ {
+ filter_step = filter_step / 2;
+ filt_direction = 0;
+ }
+ else
+ {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ cm->filter_level = filt_best;
+
+ /* restore unfiltered frame pointer */
+ cm->frame_to_show = saved_frame;
+}
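+/* Worked example of the bias above: with best_err = 1 << 20 and
+ * filt_mid = 32, the shift is 15 - 32 / 8 = 11, giving
+ * Bias = 512 * filter_step before the optional intra-rating scaling.
+ * A higher level must then beat the current best by more than Bias, while
+ * a lower level is accepted with a Bias head start, so the search
+ * deliberately leans towards smaller filter levels.
+ */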
diff --git a/libvpx/vp8/encoder/ppc/csystemdependent.c b/libvpx/vp8/encoder/ppc/csystemdependent.c
new file mode 100644
index 0000000..63f2357
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/csystemdependent.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *coeff, short *dqcoeff);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
+
+// ppc
+extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp8_sad16x16_ppc;
+extern SADFunction vp8_sad16x8_ppc;
+extern SADFunction vp8_sad8x16_ppc;
+extern SADFunction vp8_sad8x8_ppc;
+extern SADFunction vp8_sad4x4_ppc;
+
+extern variance_function vp8_variance16x16_ppc;
+extern variance_function vp8_variance8x16_ppc;
+extern variance_function vp8_variance16x8_ppc;
+extern variance_function vp8_variance8x8_ppc;
+extern variance_function vp8_variance4x4_ppc;
+extern variance_function vp8_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
+
+void vp8_cmachine_specific_config(void)
+{
+ // Pure C:
+ vp8_mbuverror = vp8_mbuverror_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
+ short_walsh4x4 = vp8_short_walsh4x4_c;
+
+ vp8_variance4x4 = vp8_variance4x4_ppc;
+ vp8_variance8x8 = vp8_variance8x8_ppc;
+ vp8_variance8x16 = vp8_variance8x16_ppc;
+ vp8_variance16x8 = vp8_variance16x8_ppc;
+ vp8_variance16x16 = vp8_variance16x16_ppc;
+ vp8_mse16x16 = vp8_mse16x16_ppc;
+
+ vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
+ vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
+ vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
+ vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
+ vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
+
+ vp8_get_mb_ss = vp8_get_mb_ss_c;
+ vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+
+ vp8_sad16x16 = vp8_sad16x16_ppc;
+ vp8_sad16x8 = vp8_sad16x8_ppc;
+ vp8_sad8x16 = vp8_sad8x16_ppc;
+ vp8_sad8x8 = vp8_sad8x8_ppc;
+ vp8_sad4x4 = vp8_sad4x4_ppc;
+
+ vp8_block_error = vp8_block_error_ppc;
+ vp8_mbblock_error = vp8_mbblock_error_c;
+
+ vp8_subtract_b = vp8_subtract_b_c;
+ vp8_subtract_mby = vp8_subtract_mby_ppc;
+ vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
+}
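+/* The assignments above are plain runtime dispatch: each hot kernel is a
+ * global function pointer bound once at start-up, so the encoder proper
+ * calls through the pointer with no per-call CPU checks. A minimal sketch
+ * of the pattern (hypothetical names, disabled; for exposition only):
+ */
+#if 0
+typedef int kernel_fn(const short *a, const short *b);
+static int kernel_c(const short *a, const short *b) { return a[0] - b[0]; }
+static kernel_fn *kernel = kernel_c; /* rebind to a SIMD version if present */
+#endif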
diff --git a/libvpx/vp8/encoder/ppc/encodemb_altivec.asm b/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
new file mode 100644
index 0000000..6e0099d
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
@@ -0,0 +1,153 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_subtract_mbuv_ppc
+ .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
+vp8_subtract_mbuv_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf000
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r9, 256
+ add r3, r3, r9
+ add r3, r3, r9
+ add r6, r6, r9
+
+ li r10, 16
+ li r9, 4
+ mtctr r9
+
+ vspltisw v0, 0
+
+mbu_loop:
+ lvsl v5, 0, r4 ;# permute vector for alignment
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r4, r4, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r4 ;# permute vector for alignment
+ lvx v1, 0, r4 ;# src
+
+ add r4, r4, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbu_loop
+
+ mtctr r9
+
+mbv_loop:
+ lvsl v5, 0, r5 ;# permute vector for alignment
+ lvx v1, 0, r5 ;# src
+ lvx v2, 0, r6 ;# pred
+
+ add r5, r5, r7
+ addi r6, r6, 16
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ lvsl v5, 0, r5 ;# permute vector for alignment
+ lvx v1, 0, r5 ;# src
+
+ add r5, r5, r7
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vperm v1, v1, v0, v5
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mbv_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ li r10, 16
+ mtctr r10
+
+ vspltisw v0, 0
+
+mby_loop:
+ lvx v1, 0, r4 ;# src
+ lvx v2, 0, r5 ;# pred
+
+ add r4, r4, r6
+ addi r5, r5, 16
+
+ vmrghb v3, v0, v1 ;# unpack high src to short
+ vmrghb v4, v0, v2 ;# unpack high pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, 0, r3 ;# store out diff
+
+ vmrglb v3, v0, v1 ;# unpack low src to short
+ vmrglb v4, v0, v2 ;# unpack low pred to short
+
+ vsubshs v3, v3, v4
+
+ stvx v3, r10, r3 ;# store out diff
+
+ addi r3, r3, 32
+
+ bdnz mby_loop
+
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
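+
+;# Note on the merges above: interleaving with the zero vector
+;# (vmrghb/vmrglb vD, v0, vSrc) widens each unsigned byte to a 16-bit lane,
+;# so the vsubshs that follows yields the signed short difference
+;# src - pred that stvx writes out.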
diff --git a/libvpx/vp8/encoder/ppc/fdct_altivec.asm b/libvpx/vp8/encoder/ppc/fdct_altivec.asm
new file mode 100644
index 0000000..935d0cb
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/fdct_altivec.asm
@@ -0,0 +1,205 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_short_fdct4x4_ppc
+ .globl vp8_short_fdct8x4_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+ lis \R0, \LABEL@ha
+ la \R1, \LABEL@l(\R0)
+ lvx \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; the only differences are
+;# in normalization (fwd is twice unitary, inv is half unitary)
+;# and that they are of course transposes of each other.
+;#
+;# The following three macros accomplish most of the implementation and
+;# are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xfffc
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ li r6, 16
+
+ load_c v0, dct_tab, 0, r9, r10
+ lvx v1, r6, r10
+ addi r10, r10, 32
+ lvx v2, 0, r10
+ lvx v3, r6, r10
+
+ load_c v4, ppc_dctperm_tab, 0, r9, r10
+ load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+ load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+ addi r1, r1, 32 ;# recover stack
+
+ mtspr 256, r11 ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
+;# a/A are the even rows 0,2 b/B are the odd rows 1,3
+;# For fwd transform, indices are horizontal positions, then frequencies.
+;# For inverse transform, frequencies then positions.
+;# The two resulting A0..A3 B0..B3 are later combined
+;# and vertically transformed.
+
+.macro two_rows_horiz Dst
+ vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
+
+ vmsumshm v10, v0, v8, v6
+ vmsumshm v10, v1, v9, v10
+ vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
+
+ vmsumshm v11, v2, v8, v6
+ vmsumshm v11, v3, v9, v11
+ vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
+
+ vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
+ vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;# forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+ vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
+ vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v10, v8, v7
+
+ vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
+ vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
+ vmsumshm v8, v8, v12, v6
+ vmsumshm v8, v9, v13, v8
+ vsraw v8, v8, v7
+
+ vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
+.endm
+
+.macro two_rows_h Dest
+ stw r0, 0(r8)
+ lwz r0, 4(r3)
+ stw r0, 4(r8)
+ lwzux r0, r3,r5
+ stw r0, 8(r8)
+ lwz r0, 4(r3)
+ stw r0, 12(r8)
+ lvx v8, 0,r8
+ two_rows_horiz \Dest
+.endm
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+ prologue
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+ addi r10, r3, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ ;# Next block
+ addi r3, r10, 8
+ addi r4, r4, 32
+ lvx v6, 0, r9 ;# v6 = Hround
+
+ vspltisw v7, 14 ;# == 14, fits in 5 signed bits
+ addi r8, r1, 0
+
+ lwz r0, 0(r3)
+ two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
+
+ lwzux r0, r3, r5
+ two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
+
+ lvx v6, r6, r9 ;# v6 = Vround
+ vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
+
+ two_rows_vert v0, v1
+ stvx v8, 0, r4
+ two_rows_vert v2, v3
+ stvx v8, r6, r4
+
+ epilogue
+
+ blr
+
+ .data
+ .align 4
+ppc_dctperm_tab:
+ .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+ .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+ .align 4
+dct_tab:
+ .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+ .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+ .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+ .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+ .align 4
+round_tab:
+ .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+ .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
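+
+;# The rounding constants above are 2^(shift-1), i.e. one half in fixed
+;# point: adding 1 << 13 before the arithmetic shift right by 14 (and
+;# 1 << 15 before the shift by 16) rounds each product to nearest instead
+;# of truncating.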
diff --git a/libvpx/vp8/encoder/ppc/rdopt_altivec.asm b/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
new file mode 100644
index 0000000..ba48230
--- /dev/null
+++ b/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
@@ -0,0 +1,51 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+ .globl vp8_block_error_ppc
+
+ .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+ mfspr r11, 256 ;# get old VRSAVE
+ oris r12, r11, 0xf800
+ mtspr 256, r12 ;# set VRSAVE
+
+ stwu r1,-32(r1) ;# create space on the stack
+
+ stw r5, 12(r1) ;# transfer dc to vector register
+
+ lvx v0, 0, r3 ;# Coeff
+ lvx v1, 0, r4 ;# dqcoeff
+
+ li r10, 16
+
+ vspltisw v3, 0
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v2, v0, v0, v3 ;# multiply differences
+
+ lvx v0, r10, r3 ;# Coeff
+ lvx v1, r10, r4 ;# dqcoeff
+
+ vsubshs v0, v0, v1
+
+ vmsumshm v1, v0, v0, v2 ;# multiply differences
+ vsumsws v1, v1, v3 ;# sum up
+
+ stvx v1, 0, r1
+ lwz r3, 12(r1) ;# return value
+
+ addi r1, r1, 32 ;# recover stack
+ mtspr 256, r11 ;# reset old VRSAVE
+
+ blr
diff --git a/libvpx/vp8/encoder/psnr.c b/libvpx/vp8/encoder/psnr.c
new file mode 100644
index 0000000..5bb49ad
--- /dev/null
+++ b/libvpx/vp8/encoder/psnr.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
+
+#define MAX_PSNR 60
+
+double vp8_mse2psnr(double Samples, double Peak, double Mse)
+{
+ double psnr;
+
+ if (Mse > 0.0)
+ psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+ else
+ psnr = MAX_PSNR; /* Limit to prevent / 0 */
+
+ if (psnr > MAX_PSNR)
+ psnr = MAX_PSNR;
+
+ return psnr;
+}
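+
+/* Worked example: for 8-bit video Peak is 255, so with a mean squared
+ * error per sample of e = Mse / Samples,
+ * psnr = 10 * log10(255 * 255 / e) ~= 48.13 - 10 * log10(e)
+ * e.g. e = 1.0 gives ~48.13 dB and e = 10.0 gives ~38.13 dB, with the
+ * result clamped to MAX_PSNR as the error approaches zero.
+ */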
diff --git a/libvpx/vp8/encoder/psnr.h b/libvpx/vp8/encoder/psnr.h
new file mode 100644
index 0000000..7f6269a
--- /dev/null
+++ b/libvpx/vp8/encoder/psnr.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
+
+#endif
diff --git a/libvpx/vp8/encoder/quantize.c b/libvpx/vp8/encoder/quantize.c
new file mode 100644
index 0000000..88fea11
--- /dev/null
+++ b/libvpx/vp8/encoder/quantize.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vp8/common/quant_common.h"
+
+#define EXACT_QUANT
+
+#ifdef EXACT_FASTQUANT
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant_fast;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ zbin = zbin_ptr[rc] ;
+
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ if (x >= zbin)
+ {
+ x += round_ptr[rc];
+ y = (((x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ }
+ }
+ }
+ *d->eob = (char)(eob + 1);
+}
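+
+/* The branchless sign handling above, traced for z = -5 (two's complement,
+ * arithmetic right shift): sz = z >> 31 = -1 (all ones), so
+ * x = (z ^ sz) - sz = 4 + 1 = 5 = abs(z); after quantizing,
+ * (y ^ sz) - sz restores the sign. For z >= 0, sz = 0 and both
+ * expressions reduce to identity.
+ */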
+
+#else
+
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int x, y, z, sz;
+ short *coeff_ptr = b->coeff;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant_fast;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+
+ eob = -1;
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ }
+ }
+ *d->eob = (char)(eob + 1);
+}
+
+#endif
+
+#ifdef EXACT_QUANT
+void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ unsigned char *quant_shift_ptr = b->quant_shift;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+ zbin_boost_ptr ++;
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ if (x >= zbin)
+ {
+ x += round_ptr[rc];
+ y = (((x * quant_ptr[rc]) >> 16) + x)
+ >> quant_shift_ptr[rc]; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ zbin_boost_ptr = b->zrun_zbin_boost; /* reset zero runlength */
+ }
+ }
+ }
+
+ *d->eob = (char)(eob + 1);
+}
+
+/* Perform regular quantization, with unbiased rounding and no zero bin. */
+void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i;
+ int rc;
+ int eob;
+ int x;
+ int y;
+ int z;
+ int sz;
+ short *coeff_ptr;
+ short *quant_ptr;
+ unsigned char *quant_shift_ptr;
+ short *qcoeff_ptr;
+ short *dqcoeff_ptr;
+ short *dequant_ptr;
+
+ coeff_ptr = b->coeff;
+ quant_ptr = b->quant;
+ quant_shift_ptr = b->quant_shift;
+ qcoeff_ptr = d->qcoeff;
+ dqcoeff_ptr = d->dqcoeff;
+ dequant_ptr = d->dequant;
+ eob = -1;
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+ for (i = 0; i < 16; i++)
+ {
+ int dq;
+ int round;
+
+ /*TODO: These arrays should be stored in zig-zag order.*/
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+ dq = dequant_ptr[rc];
+ round = dq >> 1;
+ /* Sign of z. */
+ sz = -(z < 0);
+ x = (z + sz) ^ sz;
+ x += round;
+ if (x >= dq)
+ {
+ /* Quantize x. */
+ y = (((x * quant_ptr[rc]) >> 16) + x) >> quant_shift_ptr[rc];
+ /* Put the sign back. */
+ x = (y + sz) ^ sz;
+ /* Save the coefficient and its dequantized value. */
+ qcoeff_ptr[rc] = x;
+ dqcoeff_ptr[rc] = x * dq;
+ /* Remember the last non-zero coefficient. */
+ if (y)
+ eob = i;
+ }
+ }
+
+ *d->eob = (char)(eob + 1);
+}
+
+#else
+
+void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+ int i, rc, eob;
+ int zbin;
+ int x, y, z, sz;
+ short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+ short zbin_oq_value = b->zbin_extra;
+
+ vpx_memset(qcoeff_ptr, 0, 32);
+ vpx_memset(dqcoeff_ptr, 0, 32);
+
+ eob = -1;
+
+ for (i = 0; i < 16; i++)
+ {
+ rc = vp8_default_zig_zag1d[i];
+ z = coeff_ptr[rc];
+
+ zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+ zbin_boost_ptr ++;
+ sz = (z >> 31); /* sign of z */
+ x = (z ^ sz) - sz; /* x = abs(z) */
+
+ if (x >= zbin)
+ {
+ y = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+ x = (y ^ sz) - sz; /* get the sign back */
+ qcoeff_ptr[rc] = x; /* write to destination */
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc]; /* dequantized value */
+
+ if (y)
+ {
+ eob = i; /* last nonzero coeffs */
+ zbin_boost_ptr = &b->zrun_zbin_boost[0]; /* reset zrl */
+ }
+ }
+ }
+
+ *d->eob = (char)(eob + 1);
+}
+
+#endif
+
+void vp8_quantize_mby_c(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 16; i++)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+
+ if(has_2nd_order)
+ x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_c(MACROBLOCK *x)
+{
+ int i;
+ int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+ && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+ for (i = 0; i < 24 + has_2nd_order; i++)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+
+void vp8_quantize_mbuv_c(MACROBLOCK *x)
+{
+ int i;
+
+ for (i = 16; i < 24; i++)
+ x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * one of these two C functions if the corresponding optimized routine is not
+ * available. The NEON optimized version currently implements the fast
+ * quantization for a pair of blocks. */
+void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+ vp8_regular_quantize_b(b1, d1);
+ vp8_regular_quantize_b(b2, d2);
+}
+
+void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+ vp8_fast_quantize_b_c(b1, d1);
+ vp8_fast_quantize_b_c(b2, d2);
+}
+
+
+static const int qrounding_factors[129] =
+{
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48
+};
+
+
+static const int qzbin_factors[129] =
+{
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80
+};
+
+
+static const int qrounding_factors_y2[129] =
+{
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48,
+ 48
+};
+
+
+static const int qzbin_factors_y2[129] =
+{
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 84, 84, 84, 84, 84, 84, 84, 84,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80, 80, 80, 80, 80, 80, 80, 80,
+ 80
+};
+
+
+#define EXACT_QUANT
+#ifdef EXACT_QUANT
+static void invert_quant(int improved_quant, short *quant,
+ unsigned char *shift, short d)
+{
+ if (improved_quant)
+ {
+ unsigned t;
+ int l;
+ t = d;
+ for (l = 0; t > 1; l++)
+ t >>= 1;
+ t = 1 + (1 << (16 + l)) / d;
+ *quant = (short)(t - (1<<16));
+ *shift = l;
+ }
+ else
+ {
+ *quant = (1 << 16) / d;
+ *shift = 0;
+ }
+}
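+
+/* Worked example for d = 4 with improved_quant: the loop finds l = 2 (the
+ * position of the highest set bit), t = 1 + (1 << 18) / 4 = 65537, so
+ * *quant = 65537 - 65536 = 1 and *shift = 2. The quantizer's
+ * y = (((x * quant) >> 16) + x) >> shift then evaluates to x >> 2 for the
+ * 16-bit inputs involved, i.e. division by d via one multiply and two
+ * shifts.
+ */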
+
+
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+ int i;
+ int quant_val;
+ int Q;
+
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44,
+ 44, 44};
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ /* dc values */
+ quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+ cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
+ cpi->Y1quant_shift[Q] + 0, quant_val);
+ cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+ cpi->Y2quant_fast[Q][0] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 0,
+ cpi->Y2quant_shift[Q] + 0, quant_val);
+ cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ cpi->UVquant_fast[Q][0] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 0,
+ cpi->UVquant_shift[Q] + 0, quant_val);
+ cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ /* all the ac values */
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
+ cpi->Y1quant_shift[Q] + 1, quant_val);
+ cpi->Y1zbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->Y2quant[Q] + 1,
+ cpi->Y2quant_shift[Q] + 1, quant_val);
+ cpi->Y2zbin[Q][1] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][1] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant_fast[Q][1] = (1 << 16) / quant_val;
+ invert_quant(cpi->sf.improved_quant, cpi->UVquant[Q] + 1,
+ cpi->UVquant_shift[Q] + 1, quant_val);
+ cpi->UVzbin[Q][1] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][1] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][1] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][1] = (quant_val * zbin_boost[1]) >> 7;
+
+ for (i = 2; i < 16; i++)
+ {
+ cpi->Y1quant_fast[Q][i] = cpi->Y1quant_fast[Q][1];
+ cpi->Y1quant[Q][i] = cpi->Y1quant[Q][1];
+ cpi->Y1quant_shift[Q][i] = cpi->Y1quant_shift[Q][1];
+ cpi->Y1zbin[Q][i] = cpi->Y1zbin[Q][1];
+ cpi->Y1round[Q][i] = cpi->Y1round[Q][1];
+ cpi->zrun_zbin_boost_y1[Q][i] = (cpi->common.Y1dequant[Q][1] *
+ zbin_boost[i]) >> 7;
+
+ cpi->Y2quant_fast[Q][i] = cpi->Y2quant_fast[Q][1];
+ cpi->Y2quant[Q][i] = cpi->Y2quant[Q][1];
+ cpi->Y2quant_shift[Q][i] = cpi->Y2quant_shift[Q][1];
+ cpi->Y2zbin[Q][i] = cpi->Y2zbin[Q][1];
+ cpi->Y2round[Q][i] = cpi->Y2round[Q][1];
+ cpi->zrun_zbin_boost_y2[Q][i] = (cpi->common.Y2dequant[Q][1] *
+ zbin_boost[i]) >> 7;
+
+ cpi->UVquant_fast[Q][i] = cpi->UVquant_fast[Q][1];
+ cpi->UVquant[Q][i] = cpi->UVquant[Q][1];
+ cpi->UVquant_shift[Q][i] = cpi->UVquant_shift[Q][1];
+ cpi->UVzbin[Q][i] = cpi->UVzbin[Q][1];
+ cpi->UVround[Q][i] = cpi->UVround[Q][1];
+ cpi->zrun_zbin_boost_uv[Q][i] = (cpi->common.UVdequant[Q][1] *
+ zbin_boost[i]) >> 7;
+ }
+ }
+}
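+
+/* The zbin and round tables above are fixed-point fractions of the step
+ * size with denominator 128: e.g. quant_val = 40 with a zbin factor of 80
+ * gives zbin = (80 * 40 + 64) >> 7 = 25 (0.625 of a step), and a rounding
+ * factor of 48 gives round = (48 * 40) >> 7 = 15 (0.375 of a step).
+ */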
+#else
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+ int i;
+ int quant_val;
+ int Q;
+
+ int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+
+ for (Q = 0; Q < QINDEX_RANGE; Q++)
+ {
+ /* dc values */
+ quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+ cpi->Y1quant[Q][0] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+ cpi->Y2quant[Q][0] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+ cpi->UVquant[Q][0] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][0] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+ /* all the ac values */
+ for (i = 1; i < 16; i++)
+ {
+ int rc = vp8_default_zig_zag1d[i];
+
+ quant_val = vp8_ac_yquant(Q);
+ cpi->Y1quant[Q][rc] = (1 << 16) / quant_val;
+ cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.Y1dequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+ cpi->Y2quant[Q][rc] = (1 << 16) / quant_val;
+ cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
+ cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
+ cpi->common.Y2dequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+ quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+ cpi->UVquant[Q][rc] = (1 << 16) / quant_val;
+ cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+ cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
+ cpi->common.UVdequant[Q][rc] = quant_val;
+ cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+ }
+ }
+}
+#endif
+
+#define ZBIN_EXTRA_Y \
+ (( cpi->common.Y1dequant[QIndex][1] * \
+ ( cpi->zbin_over_quant + \
+ cpi->zbin_mode_boost + \
+ x->act_zbin_adj ) ) >> 7)
+
+#define ZBIN_EXTRA_UV \
+ (( cpi->common.UVdequant[QIndex][1] * \
+ ( cpi->zbin_over_quant + \
+ cpi->zbin_mode_boost + \
+ x->act_zbin_adj ) ) >> 7)
+
+#define ZBIN_EXTRA_Y2 \
+ (( cpi->common.Y2dequant[QIndex][1] * \
+ ( (cpi->zbin_over_quant / 2) + \
+ cpi->zbin_mode_boost + \
+ x->act_zbin_adj ) ) >> 7)
+
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip)
+{
+ int i;
+ int QIndex;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int zbin_extra;
+
+ /* Select the baseline MB Q index. */
+ if (xd->segmentation_enabled)
+ {
+ /* Abs Value */
+ if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
+ /* Delta Value */
+ else
+ {
+ QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
+ /* Clamp to valid range */
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
+ }
+ }
+ else
+ QIndex = cpi->common.base_qindex;
+
+ /* This initialization should be called at least once. Use ok_to_skip to
+ * decide if it is ok to skip.
+ * Before encoding a frame, this function is always called with
+ * ok_to_skip = 0, which means no skipping of calculations. The "last"
+ * values are initialized at that time.
+ */
+ if (!ok_to_skip || QIndex != x->q_index)
+ {
+
+ xd->dequant_y1_dc[0] = 1;
+ xd->dequant_y1[0] = cpi->common.Y1dequant[QIndex][0];
+ xd->dequant_y2[0] = cpi->common.Y2dequant[QIndex][0];
+ xd->dequant_uv[0] = cpi->common.UVdequant[QIndex][0];
+
+ for (i = 1; i < 16; i++)
+ {
+ xd->dequant_y1_dc[i] =
+ xd->dequant_y1[i] = cpi->common.Y1dequant[QIndex][1];
+ xd->dequant_y2[i] = cpi->common.Y2dequant[QIndex][1];
+ xd->dequant_uv[i] = cpi->common.UVdequant[QIndex][1];
+ }
+#if 1
+ /*TODO: Remove dequant from BLOCKD. This is a temporary solution until
+ * the quantizer code uses a passed in pointer to the dequant constants.
+ * This will also require modifications to the x86 and neon assembly.
+ * */
+ for (i = 0; i < 16; i++)
+ x->e_mbd.block[i].dequant = xd->dequant_y1;
+ for (i = 16; i < 24; i++)
+ x->e_mbd.block[i].dequant = xd->dequant_uv;
+ x->e_mbd.block[24].dequant = xd->dequant_y2;
+#endif
+
+ /* Y */
+ zbin_extra = ZBIN_EXTRA_Y;
+
+ for (i = 0; i < 16; i++)
+ {
+ x->block[i].quant = cpi->Y1quant[QIndex];
+ x->block[i].quant_fast = cpi->Y1quant_fast[QIndex];
+ x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
+ x->block[i].zbin = cpi->Y1zbin[QIndex];
+ x->block[i].round = cpi->Y1round[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ /* UV */
+ zbin_extra = ZBIN_EXTRA_UV;
+
+ for (i = 16; i < 24; i++)
+ {
+ x->block[i].quant = cpi->UVquant[QIndex];
+ x->block[i].quant_fast = cpi->UVquant_fast[QIndex];
+ x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
+ x->block[i].zbin = cpi->UVzbin[QIndex];
+ x->block[i].round = cpi->UVround[QIndex];
+ x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+ x->block[i].zbin_extra = (short)zbin_extra;
+ }
+
+ /* Y2 */
+ zbin_extra = ZBIN_EXTRA_Y2;
+
+ x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
+ x->block[24].quant = cpi->Y2quant[QIndex];
+ x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
+ x->block[24].zbin = cpi->Y2zbin[QIndex];
+ x->block[24].round = cpi->Y2round[QIndex];
+ x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+ x->block[24].zbin_extra = (short)zbin_extra;
+
+ /* save this macroblock QIndex for vp8_update_zbin_extra() */
+ x->q_index = QIndex;
+
+ cpi->last_zbin_over_quant = cpi->zbin_over_quant;
+ cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+ x->last_act_zbin_adj = x->act_zbin_adj;
+
+
+
+ }
+ else if(cpi->last_zbin_over_quant != cpi->zbin_over_quant
+ || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
+ || x->last_act_zbin_adj != x->act_zbin_adj)
+ {
+ /* Y */
+ zbin_extra = ZBIN_EXTRA_Y;
+
+ for (i = 0; i < 16; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* UV */
+ zbin_extra = ZBIN_EXTRA_UV;
+
+ for (i = 16; i < 24; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* Y2 */
+ zbin_extra = ZBIN_EXTRA_Y2;
+ x->block[24].zbin_extra = (short)zbin_extra;
+
+ cpi->last_zbin_over_quant = cpi->zbin_over_quant;
+ cpi->last_zbin_mode_boost = cpi->zbin_mode_boost;
+ x->last_act_zbin_adj = x->act_zbin_adj;
+ }
+}
+
+void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x)
+{
+ int i;
+ int QIndex = x->q_index;
+ int zbin_extra;
+
+ /* Y */
+ zbin_extra = ZBIN_EXTRA_Y;
+
+ for (i = 0; i < 16; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* UV */
+ zbin_extra = ZBIN_EXTRA_UV;
+
+ for (i = 16; i < 24; i++)
+ x->block[i].zbin_extra = (short)zbin_extra;
+
+ /* Y2 */
+ zbin_extra = ZBIN_EXTRA_Y2;
+ x->block[24].zbin_extra = (short)zbin_extra;
+}
+#undef ZBIN_EXTRA_Y
+#undef ZBIN_EXTRA_UV
+#undef ZBIN_EXTRA_Y2
+
+void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
+{
+ /* Clear Zbin mode boost for default case */
+ cpi->zbin_mode_boost = 0;
+
+ /* MB level quantizer setup */
+ vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0);
+}
+
+
+void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
+{
+ VP8_COMMON *cm = &cpi->common;
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ int update = 0;
+ int new_delta_q;
+ cm->base_qindex = Q;
+
+ /* If any of the delta_q values change, the update flag has to be set. */
+ /* Currently only y2dc_delta_q may change. */
+
+ cm->y1dc_delta_q = 0;
+ cm->y2ac_delta_q = 0;
+ cm->uvdc_delta_q = 0;
+ cm->uvac_delta_q = 0;
+
+ if (Q < 4)
+ {
+ new_delta_q = 4-Q;
+ }
+ else
+ new_delta_q = 0;
+
+ update |= cm->y2dc_delta_q != new_delta_q;
+ cm->y2dc_delta_q = new_delta_q;
+
+
+ /* Set segment-specific quantizers */
+ mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
+ mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+
+ /* quantizer has to be reinitialized for any delta_q changes */
+ if(update)
+ vp8cx_init_quantizer(cpi);
+
+}
diff --git a/libvpx/vp8/encoder/quantize.h b/libvpx/vp8/encoder/quantize.h
new file mode 100644
index 0000000..d55496c
--- /dev/null
+++ b/libvpx/vp8/encoder/quantize.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+struct VP8_COMP;
+struct macroblock;
+extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
+extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
+extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
+extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, int ok_to_skip);
+extern void vp8cx_init_quantizer(struct VP8_COMP *cpi);
+
+#endif
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
new file mode 100644
index 0000000..77c1c5a
--- /dev/null
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -0,0 +1,1534 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "vp8/common/common.h"
+#include "ratectrl.h"
+#include "vp8/common/entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/systemdependent.h"
+#include "encodemv.h"
+
+
+#define MIN_BPB_FACTOR 0.01
+#define MAX_BPB_FACTOR 50
+
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+
+
+#ifdef MODE_STATS
+extern int y_modes[5];
+extern int uv_modes[4];
+extern int b_modes[10];
+
+extern int inter_y_modes[10];
+extern int inter_uv_modes[4];
+extern int inter_b_modes[10];
+#endif
+
+/* Bits Per MB at different Q (Multiplied by 512) */
+#define BPER_MB_NORMBITS 9
+
+/* Work in progress recalibration of baseline rate tables based on
+ * the assumption that bits per mb is inversely proportional to the
+ * quantizer value.
+ */
+const int vp8_bits_per_mb[2][QINDEX_RANGE] =
+{
+ /* Intra case 450000/Qintra */
+ {
+ 1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
+ 409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
+ 250000, 236842, 225000, 225000, 214285, 214285, 204545, 204545,
+ 195652, 195652, 187500, 180000, 180000, 173076, 166666, 160714,
+ 155172, 150000, 145161, 140625, 136363, 132352, 128571, 125000,
+ 121621, 121621, 118421, 115384, 112500, 109756, 107142, 104651,
+ 102272, 100000, 97826, 97826, 95744, 93750, 91836, 90000,
+ 88235, 86538, 84905, 83333, 81818, 80357, 78947, 77586,
+ 76271, 75000, 73770, 72580, 71428, 70312, 69230, 68181,
+ 67164, 66176, 65217, 64285, 63380, 62500, 61643, 60810,
+ 60000, 59210, 59210, 58441, 57692, 56962, 56250, 55555,
+ 54878, 54216, 53571, 52941, 52325, 51724, 51136, 50561,
+ 49450, 48387, 47368, 46875, 45918, 45000, 44554, 44117,
+ 43269, 42452, 41666, 40909, 40178, 39473, 38793, 38135,
+ 36885, 36290, 35714, 35156, 34615, 34090, 33582, 33088,
+ 32608, 32142, 31468, 31034, 30405, 29801, 29220, 28662,
+ },
+ /* Inter case 285000/Qinter */
+ {
+ 712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
+ 237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
+ 142500, 135714, 129545, 123913, 118750, 114000, 109615, 105555,
+ 101785, 98275, 95000, 91935, 89062, 86363, 83823, 81428,
+ 79166, 77027, 75000, 73076, 71250, 69512, 67857, 66279,
+ 64772, 63333, 61956, 60638, 59375, 58163, 57000, 55882,
+ 54807, 53773, 52777, 51818, 50892, 50000, 49137, 47500,
+ 45967, 44531, 43181, 41911, 40714, 39583, 38513, 37500,
+ 36538, 35625, 34756, 33928, 33139, 32386, 31666, 30978,
+ 30319, 29687, 29081, 28500, 27941, 27403, 26886, 26388,
+ 25909, 25446, 25000, 24568, 23949, 23360, 22800, 22265,
+ 21755, 21268, 20802, 20357, 19930, 19520, 19127, 18750,
+ 18387, 18037, 17701, 17378, 17065, 16764, 16473, 16101,
+ 15745, 15405, 15079, 14766, 14467, 14179, 13902, 13636,
+ 13380, 13133, 12895, 12666, 12445, 12179, 11924, 11632,
+ 11445, 11220, 11003, 10795, 10594, 10401, 10215, 10035,
+ }
+};
+
+static const int kf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 185, 186, 187, 188, 189, 190, 191,
+ 192, 193, 194, 195, 196, 197, 198, 199,
+ 200, 200, 201, 201, 202, 203, 203, 203,
+ 204, 204, 205, 205, 206, 206, 207, 207,
+ 208, 208, 209, 209, 210, 210, 211, 211,
+ 212, 212, 213, 213, 214, 214, 215, 215,
+ 216, 216, 217, 217, 218, 218, 219, 219,
+ 220, 220, 220, 220, 220, 220, 220, 220,
+ 220, 220, 220, 220, 220, 220, 220, 220,
+};
+
+/* #define GFQ_ADJUSTMENT (Q+100) */
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 80, 82, 84, 86, 88, 90, 92, 94,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 184, 185, 185, 186, 186, 187, 187,
+ 188, 188, 189, 189, 190, 190, 191, 191,
+ 192, 192, 193, 193, 194, 194, 194, 194,
+ 195, 195, 196, 196, 197, 197, 198, 198
+};
+
+/*
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+ 100,101,102,103,104,105,105,106,
+ 106,107,107,108,109,109,110,111,
+ 112,113,114,115,116,117,118,119,
+ 120,121,122,123,124,125,126,127,
+ 128,129,130,131,132,133,134,135,
+ 136,137,138,139,140,141,142,143,
+ 144,145,146,147,148,149,150,151,
+ 152,153,154,155,156,157,158,159,
+ 160,161,162,163,164,165,166,167,
+ 168,169,170,170,171,171,172,172,
+ 173,173,173,174,174,174,175,175,
+ 175,176,176,176,177,177,177,177,
+ 178,178,179,179,180,180,181,181,
+ 182,182,183,183,184,184,185,185,
+ 186,186,187,187,188,188,189,189,
+ 190,190,191,191,192,192,193,193,
+};
+*/
+
+static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
+{
+ 150, 155, 160, 165, 170, 175, 180, 185,
+ 190, 195, 200, 205, 210, 215, 220, 225,
+ 230, 235, 240, 245, 250, 255, 260, 265,
+ 270, 275, 280, 285, 290, 295, 300, 305,
+ 310, 320, 330, 340, 350, 360, 370, 380,
+ 390, 400, 410, 420, 430, 440, 450, 460,
+ 470, 480, 490, 500, 510, 520, 530, 540,
+ 550, 560, 570, 580, 590, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+ 600, 600, 600, 600, 600, 600, 600, 600,
+};
+
+/* % adjustment to target kf size based on separation from previous frame */
+static const int kf_boost_seperation_adjustment[16] =
+{
+ 30, 40, 50, 55, 60, 65, 70, 75,
+ 80, 85, 90, 95, 100, 100, 100, 100,
+};
+
+
+static const int gf_adjust_table[101] =
+{
+ 100,
+ 115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+ 240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+ 350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+static const int gf_intra_usage_adjustment[20] =
+{
+ 125, 120, 115, 110, 105, 100, 95, 85, 80, 75,
+ 70, 65, 60, 55, 50, 50, 50, 50, 50, 50,
+};
+
+static const int gf_interval_table[101] =
+{
+ 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+
+void vp8_save_coding_context(VP8_COMP *cpi)
+{
+ CODING_CONTEXT *const cc = & cpi->coding_context;
+
+ /* Stores a snapshot of key state variables which can subsequently be
+ * restored with a call to vp8_restore_coding_context. These functions are
+ * intended for use in a re-code loop in vp8_compress_frame where the
+ * quantizer value is adjusted between loop iterations.
+ */
+
+ cc->frames_since_key = cpi->frames_since_key;
+ cc->filter_level = cpi->common.filter_level;
+ cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due;
+ cc->frames_since_golden = cpi->common.frames_since_golden;
+
+ vp8_copy(cc->mvc, cpi->common.fc.mvc);
+ vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts);
+
+ vp8_copy(cc->ymode_prob, cpi->common.fc.ymode_prob);
+ vp8_copy(cc->uv_mode_prob, cpi->common.fc.uv_mode_prob);
+
+ vp8_copy(cc->ymode_count, cpi->mb.ymode_count);
+ vp8_copy(cc->uv_mode_count, cpi->mb.uv_mode_count);
+
+
+ /* Stats */
+#ifdef MODE_STATS
+ vp8_copy(cc->y_modes, y_modes);
+ vp8_copy(cc->uv_modes, uv_modes);
+ vp8_copy(cc->b_modes, b_modes);
+ vp8_copy(cc->inter_y_modes, inter_y_modes);
+ vp8_copy(cc->inter_uv_modes, inter_uv_modes);
+ vp8_copy(cc->inter_b_modes, inter_b_modes);
+#endif
+
+ cc->this_frame_percent_intra = cpi->this_frame_percent_intra;
+}
+
+
+void vp8_restore_coding_context(VP8_COMP *cpi)
+{
+ CODING_CONTEXT *const cc = & cpi->coding_context;
+
+ /* Restore key state variables to the snapshot state stored in the
+ * previous call to vp8_save_coding_context.
+ */
+
+ cpi->frames_since_key = cc->frames_since_key;
+ cpi->common.filter_level = cc->filter_level;
+ cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due;
+ cpi->common.frames_since_golden = cc->frames_since_golden;
+
+ vp8_copy(cpi->common.fc.mvc, cc->mvc);
+
+ vp8_copy(cpi->rd_costs.mvcosts, cc->mvcosts);
+
+ vp8_copy(cpi->common.fc.ymode_prob, cc->ymode_prob);
+ vp8_copy(cpi->common.fc.uv_mode_prob, cc->uv_mode_prob);
+
+ vp8_copy(cpi->mb.ymode_count, cc->ymode_count);
+ vp8_copy(cpi->mb.uv_mode_count, cc->uv_mode_count);
+
+ /* Stats */
+#ifdef MODE_STATS
+ vp8_copy(y_modes, cc->y_modes);
+ vp8_copy(uv_modes, cc->uv_modes);
+ vp8_copy(b_modes, cc->b_modes);
+ vp8_copy(inter_y_modes, cc->inter_y_modes);
+ vp8_copy(inter_uv_modes, cc->inter_uv_modes);
+ vp8_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+
+ cpi->this_frame_percent_intra = cc->this_frame_percent_intra;
+}
+
+
+void vp8_setup_key_frame(VP8_COMP *cpi)
+{
+ /* Setup for Key frame: */
+
+ vp8_default_coef_probs(& cpi->common);
+
+ vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ {
+ int flag[2] = {1, 1};
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
+ }
+
+    /* Make sure we initialize separate contexts for altref, gold, and normal.
+     * TODO: we shouldn't need 3 different copies of the structure to do this!
+     */
+ vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+ vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+
+    cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;
+
+ /* Provisional interval before next GF */
+ if (cpi->auto_gold)
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ else
+ cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+
+ cpi->common.refresh_golden_frame = 1;
+ cpi->common.refresh_alt_ref_frame = 1;
+}
+
+
+static int estimate_bits_at_q(int frame_kind, int Q, int MBs,
+ double correction_factor)
+{
+ int Bpm = (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]);
+
+ /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+ * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+ * largest Bpm takes 20 bits.
+ */
+ if (MBs > (1 << 11))
+ return (Bpm >> BPER_MB_NORMBITS) * MBs;
+ else
+ return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
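+
+/* Worked example (editor's note): for an inter frame at Q index 63 with
+ * correction_factor = 1.0, Bpm = vp8_bits_per_mb[1][63] = 37500. A CIF
+ * clip has 22 * 18 = 396 macroblocks, which is <= 2048, so the second
+ * branch applies: (37500 * 396) >> 9 = 29003 bits for the frame.
+ */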
+
+
+static void calc_iframe_target_size(VP8_COMP *cpi)
+{
+ /* boost defaults to half second */
+ int kf_boost;
+ uint64_t target;
+
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ int Q = cpi->oxcf.key_q;
+
+ target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs,
+ cpi->key_frame_rate_correction_factor);
+ }
+ else if (cpi->pass == 2)
+ {
+ /* New Two pass RC */
+ target = cpi->per_frame_bandwidth;
+ }
+ /* First Frame is a special case */
+ else if (cpi->common.current_video_frame == 0)
+ {
+        /* In 1 pass there is no information on which to base the size, so
+         * use bandwidth per second * fraction of the initial buffer level
+         */
+ target = cpi->oxcf.starting_buffer_level / 2;
+
+ if(target > cpi->oxcf.target_bandwidth * 3 / 2)
+ target = cpi->oxcf.target_bandwidth * 3 / 2;
+ }
+ else
+ {
+ /* if this keyframe was forced, use a more recent Q estimate */
+ int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
+ ? cpi->avg_frame_qindex : cpi->ni_av_qi;
+
+ int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
+ /* Boost depends somewhat on frame rate: only used for 1 layer case. */
+ if (cpi->oxcf.number_of_layers == 1) {
+ kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
+ }
+ else {
+ /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
+ kf_boost = initial_boost;
+ }
+
+ /* adjustment up based on q: this factor ranges from ~1.2 to 2.2. */
+ kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
+
+        /* frame separation adjustment (down) */
+ if (cpi->frames_since_key < cpi->output_frame_rate / 2)
+ kf_boost = (int)(kf_boost
+ * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+
+        /* Minimal target size is |2.0 * per_frame_bandwidth|. */
+ if (kf_boost < 16)
+ kf_boost = 16;
+
+ target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4;
+ }
+
+
+ if (cpi->oxcf.rc_max_intra_bitrate_pct)
+ {
+ unsigned int max_rate = cpi->per_frame_bandwidth
+ * cpi->oxcf.rc_max_intra_bitrate_pct / 100;
+
+ if (target > max_rate)
+ target = max_rate;
+ }
+
+ cpi->this_frame_target = (int)target;
+
+    /* TODO: if we separate rate targeting from Q targeting, move this.
+ * Reset the active worst quality to the baseline value for key frames.
+ */
+ if (cpi->pass != 2)
+ cpi->active_worst_quality = cpi->worst_quality;
+
+#if 0
+ {
+ FILE *f;
+
+ f = fopen("kf_boost.stt", "a");
+ fprintf(f, " %8u %10d %10d %10d\n",
+ cpi->common.current_video_frame, cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending);
+
+ fclose(f);
+ }
+#endif
+}
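+
+/* Worked example (editor's note): a single-layer 30 fps stream gives
+ * kf_boost = MAX(32, 2 * 30 - 16) = 44. At Q = 0 the table adjustment is
+ * kf_boost_qadjustment[0] = 128, so kf_boost = 44 * 128 / 100 = 56 and,
+ * with no recent key frame reduction,
+ * target = ((16 + 56) * per_frame_bandwidth) >> 4, i.e. 4.5x the average
+ * per frame allocation (before any rc_max_intra_bitrate_pct cap).
+ */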
+
+
+/* Do the best we can to define the parameters for the next GF based on what
+ * information we have available.
+ */
+static void calc_gf_params(VP8_COMP *cpi)
+{
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+ int Boost = 0;
+
+    int gf_frame_useage = 0; /* Golden frame usage since last GF */
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ if (tot_mbs)
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+ if (pct_gf_active > gf_frame_useage)
+ gf_frame_useage = pct_gf_active;
+
+ /* Not two pass */
+ if (cpi->pass != 2)
+ {
+ /* Single Pass lagged mode: TBD */
+ if (0)
+ {
+ }
+
+ /* Single Pass compression: Has to use current and historical data */
+ else
+ {
+#if 0
+ /* Experimental code */
+ int index = cpi->one_pass_frame_index;
+ int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS;
+
+ /* ************** Experimental code - incomplete */
+ /*
+ double decay_val = 1.0;
+ double IIAccumulator = 0.0;
+ double last_iiaccumulator = 0.0;
+ double IIRatio;
+
+ cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS;
+
+ for ( i = 0; i < (frames_to_scan - 1); i++ )
+ {
+ if ( index < 0 )
+ index = MAX_LAG_BUFFERS;
+ index --;
+
+ if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 )
+ {
+ IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error;
+
+ if ( IIRatio > 30.0 )
+ IIRatio = 30.0;
+ }
+ else
+ IIRatio = 30.0;
+
+ IIAccumulator += IIRatio * decay_val;
+
+ decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter;
+
+ if ( (i > MIN_GF_INTERVAL) &&
+ ((IIAccumulator - last_iiaccumulator) < 2.0) )
+ {
+ break;
+ }
+ last_iiaccumulator = IIAccumulator;
+ }
+
+ Boost = IIAccumulator*100.0/16.0;
+ cpi->baseline_gf_interval = i;
+
+ */
+#else
+
+ /*************************************************************/
+ /* OLD code */
+
+ /* Adjust boost based upon ambient Q */
+ Boost = GFQ_ADJUSTMENT;
+
+            /* Adjust based upon most recently measured intra usage */
+ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+
+ /* Adjust gf boost based upon GF usage since last GF */
+ Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
+#endif
+ }
+
+        /* Golden frame boost without the recode loop often goes awry. Be
+         * safe by keeping numbers down.
+         */
+ if (!cpi->sf.recode_loop)
+ {
+ if (cpi->compressor_speed == 2)
+ Boost = Boost / 2;
+ }
+
+ /* Apply an upper limit based on Q for 1 pass encodes */
+ if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+ Boost = kf_gf_boost_qlimits[Q];
+
+ /* Apply lower limits to boost. */
+ else if (Boost < 110)
+ Boost = 110;
+
+ /* Note the boost used */
+ cpi->last_boost = Boost;
+
+ }
+
+ /* Estimate next interval
+ * This is updated once the real frame size/boost is known.
+ */
+ if (cpi->oxcf.fixed_q == -1)
+ {
+ if (cpi->pass == 2) /* 2 Pass */
+ {
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ }
+ else /* 1 Pass */
+ {
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ if (cpi->last_boost > 750)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost > 1000)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost > 1250)
+ cpi->frames_till_gf_update_due++;
+
+ if (cpi->last_boost >= 1500)
+ cpi->frames_till_gf_update_due ++;
+
+ if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+ cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
+
+ if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
+ cpi->frames_till_gf_update_due = cpi->max_gf_interval;
+ }
+ }
+ else
+ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+ /* ARF on or off */
+ if (cpi->pass != 2)
+ {
+ /* For now Alt ref is not allowed except in 2 pass modes. */
+ cpi->source_alt_ref_pending = 0;
+
+ /*if ( cpi->oxcf.fixed_q == -1)
+ {
+ if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+ cpi->source_alt_ref_pending = 1;
+ else
+ cpi->source_alt_ref_pending = 0;
+ }*/
+ }
+}
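+
+/* Worked example (editor's note): in one pass at Q = 12,
+ * GFQ_ADJUSTMENT = vp8_gf_boost_qadjustment[12] = 100. With 5% recent
+ * intra usage, gf_intra_usage_adjustment[5] = 100 leaves Boost at 100; a
+ * 10% GF usage then scales it by gf_adjust_table[10] = 230 to 230, which
+ * exceeds kf_gf_boost_qlimits[12] = 210 and is capped there.
+ */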
+
+
+static void calc_pframe_target_size(VP8_COMP *cpi)
+{
+ int min_frame_target;
+ int Adjustment;
+ int old_per_frame_bandwidth = cpi->per_frame_bandwidth;
+
+ if ( cpi->current_layer > 0)
+ cpi->per_frame_bandwidth =
+ cpi->layer_context[cpi->current_layer].avg_frame_size_for_layer;
+
+ min_frame_target = 0;
+
+ if (cpi->pass == 2)
+ {
+ min_frame_target = cpi->min_frame_bandwidth;
+
+ if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+ min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+ }
+ else if (min_frame_target < cpi->per_frame_bandwidth / 4)
+ min_frame_target = cpi->per_frame_bandwidth / 4;
+
+
+ /* Special alt reference frame case */
+ if((cpi->common.refresh_alt_ref_frame) && (cpi->oxcf.number_of_layers == 1))
+ {
+ if (cpi->pass == 2)
+ {
+ /* Per frame bit target for the alt ref frame */
+ cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+
+ /* One Pass ??? TBD */
+ }
+
+ /* Normal frames (gf,and inter) */
+ else
+ {
+ /* 2 pass */
+ if (cpi->pass == 2)
+ {
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+ /* 1 pass */
+ else
+ {
+ /* Make rate adjustment to recover bits spent in key frame
+ * Test to see if the key frame inter data rate correction
+ * should still be in force
+ */
+ if (cpi->kf_overspend_bits > 0)
+ {
+ Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+ if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target))
+ Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+ cpi->kf_overspend_bits -= Adjustment;
+
+ /* Calculate an inter frame bandwidth target for the next
+ * few frames designed to recover any extra bits spent on
+ * the key frame.
+ */
+ cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment;
+
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+ }
+ else
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+
+ /* If appropriate make an adjustment to recover bits spent on a
+ * recent GF
+ */
+ if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
+ {
+ int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
+
+ if (Adjustment > (cpi->this_frame_target - min_frame_target))
+ Adjustment = (cpi->this_frame_target - min_frame_target);
+
+ cpi->gf_overspend_bits -= Adjustment;
+ cpi->this_frame_target -= Adjustment;
+ }
+
+ /* Apply small + and - boosts for non gf frames */
+ if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) &&
+ (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1)))
+ {
+ /* % Adjustment limited to the range 1% to 10% */
+ Adjustment = (cpi->last_boost - 100) >> 5;
+
+ if (Adjustment < 1)
+ Adjustment = 1;
+ else if (Adjustment > 10)
+ Adjustment = 10;
+
+ /* Convert to bits */
+ Adjustment = (cpi->this_frame_target * Adjustment) / 100;
+
+ if (Adjustment > (cpi->this_frame_target - min_frame_target))
+ Adjustment = (cpi->this_frame_target - min_frame_target);
+
+ if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
+ cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+ else
+ cpi->this_frame_target -= Adjustment;
+ }
+ }
+ }
+
+    /* Sanity check that the total sum of adjustments is not above the
+     * maximum allowed. That is, having allowed for KF and GF penalties,
+     * we have not pushed the current inter frame target too low. If the
+     * adjustment we apply here is not capable of recovering all the extra
+     * bits we have spent in the KF or GF then the remainder will have to
+     * be recovered over a longer time span via other buffer / rate control
+     * mechanisms.
+     */
+ if (cpi->this_frame_target < min_frame_target)
+ cpi->this_frame_target = min_frame_target;
+
+ if (!cpi->common.refresh_alt_ref_frame)
+ /* Note the baseline target data rate for this inter frame. */
+ cpi->inter_frame_target = cpi->this_frame_target;
+
+ /* One Pass specific code */
+ if (cpi->pass == 0)
+ {
+ /* Adapt target frame size with respect to any buffering constraints: */
+ if (cpi->buffered_mode)
+ {
+ int one_percent_bits = (int)
+ (1 + cpi->oxcf.optimal_buffer_level / 100);
+
+ if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+ (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+ {
+ int percent_low = 0;
+
+ /* Decide whether or not we need to adjust the frame data
+ * rate target.
+ *
+                 * If we are below the optimal buffer fullness level
+ * and adherence to buffering constraints is important to
+ * the end usage then adjust the per frame target.
+ */
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+ {
+ percent_low = (int)
+ ((cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+ one_percent_bits);
+ }
+ /* Are we overshooting the long term clip data rate... */
+ else if (cpi->bits_off_target < 0)
+ {
+ /* Adjust per frame data target downwards to compensate. */
+ percent_low = (int)(100 * -cpi->bits_off_target /
+ (cpi->total_byte_count * 8));
+ }
+
+ if (percent_low > cpi->oxcf.under_shoot_pct)
+ percent_low = cpi->oxcf.under_shoot_pct;
+ else if (percent_low < 0)
+ percent_low = 0;
+
+ /* lower the target bandwidth for this frame. */
+ cpi->this_frame_target -=
+ (cpi->this_frame_target * percent_low) / 200;
+
+                /* Are we allowing control of active_worst_allowed_q
+                 * according to buffer level?
+                 */
+ if (cpi->auto_worst_q && cpi->ni_frames > 150)
+ {
+ int64_t critical_buffer_level;
+
+ /* For streaming applications the most important factor is
+ * cpi->buffer_level as this takes into account the
+ * specified short term buffering constraints. However,
+ * hitting the long term clip data rate target is also
+ * important.
+ */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ /* Take the smaller of cpi->buffer_level and
+ * cpi->bits_off_target
+ */
+ critical_buffer_level =
+ (cpi->buffer_level < cpi->bits_off_target)
+ ? cpi->buffer_level : cpi->bits_off_target;
+ }
+ /* For local file playback short term buffering constraints
+ * are less of an issue
+ */
+ else
+ {
+ /* Consider only how we are doing for the clip as a
+ * whole
+ */
+ critical_buffer_level = cpi->bits_off_target;
+ }
+
+ /* Set the active worst quality based upon the selected
+ * buffer fullness number.
+ */
+ if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
+ {
+ if ( critical_buffer_level >
+ (cpi->oxcf.optimal_buffer_level >> 2) )
+ {
+ int64_t qadjustment_range =
+ cpi->worst_quality - cpi->ni_av_qi;
+ int64_t above_base =
+ (critical_buffer_level -
+ (cpi->oxcf.optimal_buffer_level >> 2));
+
+ /* Step active worst quality down from
+ * cpi->ni_av_qi when (critical_buffer_level ==
+ * cpi->optimal_buffer_level) to
+ * cpi->worst_quality when
+ * (critical_buffer_level ==
+ * cpi->optimal_buffer_level >> 2)
+ */
+ cpi->active_worst_quality =
+ cpi->worst_quality -
+ (int)((qadjustment_range * above_base) /
+ (cpi->oxcf.optimal_buffer_level*3>>2));
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->ni_av_qi;
+ }
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+ else
+ {
+ int percent_high = 0;
+
+ if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level))
+ {
+ percent_high = (int)((cpi->buffer_level
+ - cpi->oxcf.optimal_buffer_level)
+ / one_percent_bits);
+ }
+ else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+ {
+ percent_high = (int)((100 * cpi->bits_off_target)
+ / (cpi->total_byte_count * 8));
+ }
+
+ if (percent_high > cpi->oxcf.over_shoot_pct)
+ percent_high = cpi->oxcf.over_shoot_pct;
+ else if (percent_high < 0)
+ percent_high = 0;
+
+ cpi->this_frame_target += (cpi->this_frame_target *
+ percent_high) / 200;
+
+ /* Are we allowing control of active_worst_allowed_q according
+ * to buffer level.
+ */
+ if (cpi->auto_worst_q && cpi->ni_frames > 150)
+ {
+ /* When using the relaxed buffer model stick to the
+ * user specified value
+ */
+ cpi->active_worst_quality = cpi->ni_av_qi;
+ }
+ else
+ {
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+ }
+
+ /* Set active_best_quality to prevent quality rising too high */
+ cpi->active_best_quality = cpi->best_quality;
+
+ /* Worst quality obviously must not be better than best quality */
+ if (cpi->active_worst_quality <= cpi->active_best_quality)
+ cpi->active_worst_quality = cpi->active_best_quality + 1;
+
+ if(cpi->active_worst_quality > 127)
+ cpi->active_worst_quality = 127;
+ }
+    /* Unbuffered mode (e.g. video conferencing) */
+ else
+ {
+ /* Set the active worst quality */
+ cpi->active_worst_quality = cpi->worst_quality;
+ }
+
+ /* Special trap for constrained quality mode
+ * "active_worst_quality" may never drop below cq level
+ * for any frame type.
+ */
+ if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+ cpi->active_worst_quality < cpi->cq_target_quality)
+ {
+ cpi->active_worst_quality = cpi->cq_target_quality;
+ }
+ }
+
+    /* Test to see if we have to drop a frame.
+     * The auto-drop frame code is only used in buffered mode.
+     * In unbuffered mode (e.g. video conferencing) the decision to
+     * code or drop a frame is made outside the codec in response to real
+     * world comms or buffer considerations.
+     */
+ if (cpi->drop_frames_allowed &&
+ (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+ ((cpi->common.frame_type != KEY_FRAME)))
+ {
+        /* Check for a buffer underrun crisis in which case we have to drop
+         * a frame
+         */
+ if ((cpi->buffer_level < 0))
+ {
+#if 0
+ FILE *f = fopen("dec.stt", "a");
+ fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n",
+ (int) cpi->common.current_video_frame,
+ cpi->decimation_factor, cpi->common.horiz_scale,
+ (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level);
+ fclose(f);
+#endif
+ cpi->drop_frame = 1;
+
+ /* Update the buffer level variable. */
+ cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size;
+ cpi->buffer_level = cpi->bits_off_target;
+ }
+ }
+
+ /* Adjust target frame size for Golden Frames: */
+ if (cpi->oxcf.error_resilient_mode == 0 &&
+ (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame)
+ {
+ int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+        int gf_frame_useage = 0; /* Golden frame usage since last GF */
+ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] +
+ cpi->recent_ref_frame_usage[LAST_FRAME] +
+ cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+ cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+ int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+ if (tot_mbs)
+ gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+ if (pct_gf_active > gf_frame_useage)
+ gf_frame_useage = pct_gf_active;
+
+ /* Is a fixed manual GF frequency being used */
+ if (cpi->auto_gold)
+ {
+            /* For one pass throw a GF if recent frame intra usage is
+             * low or the GF usage is high
+             */
+ if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
+ cpi->common.refresh_golden_frame = 1;
+
+            /* Two pass GF decision */
+ else if (cpi->pass == 2)
+ cpi->common.refresh_golden_frame = 1;
+ }
+
+#if 0
+
+ /* Debug stats */
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("gf_useaget.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->common.refresh_golden_frame == 1)
+ {
+#if 0
+
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("GFexit.stt", "a");
+ fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->auto_adjust_gold_quantizer)
+ {
+ calc_gf_params(cpi);
+ }
+
+            /* If we are using the alternate ref instead of the gf then do
+             * not apply the boost here. It will instead be applied to the
+             * altref update (Jim's modified boost).
+             */
+ if (!cpi->source_alt_ref_active)
+ {
+ if (cpi->oxcf.fixed_q < 0)
+ {
+ if (cpi->pass == 2)
+ {
+ /* The spend on the GF is defined in the two pass
+ * code for two pass encodes
+ */
+ cpi->this_frame_target = cpi->per_frame_bandwidth;
+ }
+ else
+ {
+ int Boost = cpi->last_boost;
+ int frames_in_section = cpi->frames_till_gf_update_due + 1;
+ int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+ int bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+                        /* Normalize the boost and allocation chunks down
+                         * to prevent overflow
+                         */
+ while (Boost > 1000)
+ {
+ Boost /= 2;
+ allocation_chunks /= 2;
+ }
+
+ /* Avoid loss of precision but avoid overflow */
+ if ((bits_in_section >> 7) > allocation_chunks)
+ cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks);
+ else
+ cpi->this_frame_target = (Boost * bits_in_section) / allocation_chunks;
+ }
+ }
+ else
+ cpi->this_frame_target =
+ (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)
+ * cpi->last_boost) / 100;
+
+ }
+            /* If there is an active ARF at this location use the minimum
+             * bits on this frame even if it is a constructed arf.
+             * The active maximum quantizer ensures that an appropriate
+             * number of bits will be spent if needed for constructed ARFs.
+             */
+ else
+ {
+ cpi->this_frame_target = 0;
+ }
+
+ cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+ }
+ }
+
+ cpi->per_frame_bandwidth = old_per_frame_bandwidth;
+}
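+
+/* Worked example (editor's note): in CBR with optimal_buffer_level =
+ * 5000000 bits and buffer_level = 3000000, one_percent_bits = 50001 and
+ * percent_low = 2000000 / 50001 = 39 (assuming under_shoot_pct >= 39),
+ * so the frame target is reduced by this_frame_target * 39 / 200,
+ * roughly 19.5%.
+ */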
+
+
+void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
+{
+ int Q = cpi->common.base_qindex;
+ int correction_factor = 100;
+ double rate_correction_factor;
+ double adjustment_limit;
+
+ int projected_size_based_on_q = 0;
+
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ }
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ rate_correction_factor = cpi->gf_rate_correction_factor;
+ else
+ rate_correction_factor = cpi->rate_correction_factor;
+ }
+
+ /* Work out how big we would have expected the frame to be at this Q
+ * given the current correction factor. Stay in double to avoid int
+ * overflow when values are large
+ */
+ projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+ /* Make some allowance for cpi->zbin_over_quant */
+ if (cpi->zbin_over_quant > 0)
+ {
+ int Z = cpi->zbin_over_quant;
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0;
+
+ while (Z > 0)
+ {
+ Z --;
+ projected_size_based_on_q =
+ (int)(Factor * projected_size_based_on_q);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+ }
+ }
+
+ /* Work out a size correction factor. */
+ if (projected_size_based_on_q > 0)
+ correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+
+ /* More heavily damped adjustment used if we have been oscillating
+ * either side of target
+ */
+ switch (damp_var)
+ {
+ case 0:
+ adjustment_limit = 0.75;
+ break;
+ case 1:
+ adjustment_limit = 0.375;
+ break;
+ case 2:
+ default:
+ adjustment_limit = 0.25;
+ break;
+ }
+
+ if (correction_factor > 102)
+ {
+ /* We are not already at the worst allowable quality */
+ correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ /* Keep rate_correction_factor within limits */
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ }
+ else if (correction_factor < 99)
+ {
+ /* We are not already at the best allowable quality */
+ correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+ /* Keep rate_correction_factor within limits */
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ cpi->key_frame_rate_correction_factor = rate_correction_factor;
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ cpi->gf_rate_correction_factor = rate_correction_factor;
+ else
+ cpi->rate_correction_factor = rate_correction_factor;
+ }
+}
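+
+/* Worked example (editor's note): if the model projected 10000 bits but
+ * the frame actually took 12000, correction_factor = 120. With damp_var
+ * = 0 (adjustment_limit = 0.75) it becomes (int)(100.5 + 20 * 0.75) =
+ * 115, so the rate correction factor grows by 15%, clamped to the range
+ * [MIN_BPB_FACTOR, MAX_BPB_FACTOR].
+ */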
+
+
+int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
+{
+ int Q = cpi->active_worst_quality;
+
+ /* Reset Zbin OQ value */
+ cpi->zbin_over_quant = 0;
+
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ Q = cpi->oxcf.fixed_q;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ Q = cpi->oxcf.key_q;
+ }
+ else if (cpi->common.refresh_alt_ref_frame)
+ {
+ Q = cpi->oxcf.alt_q;
+ }
+ else if (cpi->common.refresh_golden_frame)
+ {
+ Q = cpi->oxcf.gold_q;
+ }
+
+ }
+ else
+ {
+ int i;
+ int last_error = INT_MAX;
+ int target_bits_per_mb;
+ int bits_per_mb_at_this_q;
+ double correction_factor;
+
+ /* Select the appropriate correction factor based upon type of frame. */
+ if (cpi->common.frame_type == KEY_FRAME)
+ correction_factor = cpi->key_frame_rate_correction_factor;
+ else
+ {
+ if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ correction_factor = cpi->gf_rate_correction_factor;
+ else
+ correction_factor = cpi->rate_correction_factor;
+ }
+
+ /* Calculate required scaling factor based on target frame size and
+ * size of frame produced using previous Q
+ */
+ if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+ /* Case where we would overflow int */
+ target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
+ else
+ target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
+
+ i = cpi->active_best_quality;
+
+ do
+ {
+ bits_per_mb_at_this_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]);
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb)
+ {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ Q = i;
+ else
+ Q = i - 1;
+
+ break;
+ }
+ else
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ }
+ while (++i <= cpi->active_worst_quality);
+
+
+ /* If we are at MAXQ then enable Q over-run which seeks to claw
+ * back additional bits through things like the RD multiplier
+ * and zero bin size.
+ */
+ if (Q >= MAXQ)
+ {
+ int zbin_oqmax;
+
+ double Factor = 0.99;
+ double factor_adjustment = 0.01 / 256.0;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ zbin_oqmax = 0;
+ else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+ zbin_oqmax = 16;
+ else
+ zbin_oqmax = ZBIN_OQ_MAX;
+
+ /*{
+ double Factor = (double)target_bits_per_mb/(double)bits_per_mb_at_this_q;
+ double Oq;
+
+ Factor = Factor/1.2683;
+
+ Oq = pow( Factor, (1.0/-0.165) );
+
+ if ( Oq > zbin_oqmax )
+ Oq = zbin_oqmax;
+
+ cpi->zbin_over_quant = (int)Oq;
+ }*/
+
+            /* Each increment in the zbin is assumed to have a fixed effect
+             * on bitrate. This is of course not true. The effect will be
+             * highly clip dependent and may well have sudden steps. The
+             * idea here is to achieve higher effective quantizers than the
+             * normal maximum by expanding the zero bin and hence
+             * decreasing the number of low magnitude non zero coefficients.
+             */
+ while (cpi->zbin_over_quant < zbin_oqmax)
+ {
+ cpi->zbin_over_quant ++;
+
+ if (cpi->zbin_over_quant > zbin_oqmax)
+ cpi->zbin_over_quant = zbin_oqmax;
+
+ /* Adjust bits_per_mb_at_this_q estimate */
+ bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
+ Factor += factor_adjustment;
+
+ if (Factor >= 0.999)
+ Factor = 0.999;
+
+ /* Break out if we get down to the target rate */
+ if (bits_per_mb_at_this_q <= target_bits_per_mb)
+ break;
+ }
+
+ }
+ }
+
+ return Q;
+}
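+
+/* Worked example (editor's note): with correction_factor = 1.0 on an
+ * inter frame and target_bits_per_mb = 50500 (assuming
+ * active_best_quality <= 52), the scan reaches i = 52 (50892 bits/MB,
+ * 392 over target) and then i = 53 (50000 bits/MB, 500 under). Since
+ * 500 > last_error = 392, the closer overshooting index wins and Q = 52.
+ */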
+
+
+static int estimate_keyframe_frequency(VP8_COMP *cpi)
+{
+ int i;
+
+ /* Average key frame frequency */
+ int av_key_frame_frequency = 0;
+
+ /* First key frame at start of sequence is a special case. We have no
+ * frequency data.
+ */
+ if (cpi->key_frame_count == 1)
+ {
+ /* Assume a default of 1 kf every 2 seconds, or the max kf interval,
+ * whichever is smaller.
+ */
+        int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1;
+        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+
+        if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
+            av_key_frame_frequency = key_freq;
+
+ cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+ = av_key_frame_frequency;
+ }
+ else
+ {
+ unsigned int total_weight = 0;
+ int last_kf_interval =
+ (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1;
+
+ /* reset keyframe context and calculate weighted average of last
+ * KEY_FRAME_CONTEXT keyframes
+ */
+ for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+ {
+ if (i < KEY_FRAME_CONTEXT - 1)
+ cpi->prior_key_frame_distance[i]
+ = cpi->prior_key_frame_distance[i+1];
+ else
+ cpi->prior_key_frame_distance[i] = last_kf_interval;
+
+ av_key_frame_frequency += prior_key_frame_weight[i]
+ * cpi->prior_key_frame_distance[i];
+ total_weight += prior_key_frame_weight[i];
+ }
+
+ av_key_frame_frequency /= total_weight;
+
+ }
+ return av_key_frame_frequency;
+}
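+
+/* Worked example (editor's note): with prior key frame distances
+ * {60, 60, 60, 60, 30} (most recent last) and weights {1, 2, 3, 4, 5},
+ * the estimate is (60 + 120 + 180 + 240 + 150) / 15 = 50 frames, so the
+ * most recent intervals dominate.
+ */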
+
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+ /* Clear down mmx registers to allow floating point in what follows */
+ vp8_clear_system_state();
+
+ /* Do we have any key frame overspend to recover? */
+ /* Two-pass overspend handled elsewhere. */
+ if ((cpi->pass != 2)
+ && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
+ {
+ int overspend;
+
+ /* Update the count of key frame overspend to be recovered in
+ * subsequent frames. A portion of the KF overspend is treated as gf
+ * overspend (and hence recovered more quickly) as the kf is also a
+ * gf. Otherwise the few frames following each kf tend to get more
+ * bits allocated than those following other gfs.
+ */
+ overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth);
+
+ if (cpi->oxcf.number_of_layers > 1)
+ cpi->kf_overspend_bits += overspend;
+ else
+ {
+ cpi->kf_overspend_bits += overspend * 7 / 8;
+ cpi->gf_overspend_bits += overspend * 1 / 8;
+ }
+
+ /* Work out how much to try and recover per frame. */
+ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits
+ / estimate_keyframe_frequency(cpi);
+ }
+
+ cpi->frames_since_key = 0;
+ cpi->key_frame_count++;
+}
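+
+/* Worked example (editor's note): assuming no prior overspend, a
+ * single-layer key frame that overshot per_frame_bandwidth by 8000 bits
+ * books 7000 bits as kf overspend and 1000 bits as gf overspend; with an
+ * estimated key frame frequency of 50, kf_bitrate_adjustment =
+ * 7000 / 50 = 140 bits recovered from each following inter frame.
+ */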
+
+
+void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
+{
+ /* Set-up bounds on acceptable frame size: */
+ if (cpi->oxcf.fixed_q >= 0)
+ {
+ /* Fixed Q scenario: frame size never outranges target
+ * (there is no target!)
+ */
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ }
+ else
+ {
+ if (cpi->common.frame_type == KEY_FRAME)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ }
+ else
+ {
+ if (cpi->oxcf.number_of_layers > 1 ||
+ cpi->common.refresh_alt_ref_frame ||
+ cpi->common.refresh_golden_frame)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ }
+ else
+ {
+ /* For CBR take buffer fullness into account */
+ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+ {
+ if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1))
+ {
+ /* Buffer is too full so relax overshoot and tighten
+ * undershoot
+ */
+ *frame_over_shoot_limit = cpi->this_frame_target * 12 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
+ }
+ else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1))
+ {
+ /* Buffer is too low so relax undershoot and tighten
+ * overshoot
+ */
+ *frame_over_shoot_limit = cpi->this_frame_target * 10 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
+ }
+ else
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ /* VBR and CQ mode */
+ /* Note that tighter restrictions here can help quality
+ * but hurt encode speed
+ */
+ else
+ {
+                    /* Strong overshoot limit for constrained quality */
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+ }
+ else
+ {
+ *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ }
+ }
+ }
+ }
+
+        /* For very small rate targets, where the fractional adjustment
+         * (e.g. * 7/8) may be tiny, make sure there is at least a minimum
+         * range.
+         */
+ *frame_over_shoot_limit += 200;
+ *frame_under_shoot_limit -= 200;
+ if ( *frame_under_shoot_limit < 0 )
+ *frame_under_shoot_limit = 0;
+
+ }
+}
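+
+/* Worked example (editor's note): for a CBR inter frame with
+ * this_frame_target = 8000 and a buffer near the middle of its range,
+ * the limits start at 8000 * 11 / 8 = 11000 and 8000 * 5 / 8 = 5000,
+ * then widen by 200 bits to 11200 and 4800.
+ */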
+
+
+/* return of 0 means drop frame */
+int vp8_pick_frame_size(VP8_COMP *cpi)
+{
+ VP8_COMMON *cm = &cpi->common;
+
+ if (cm->frame_type == KEY_FRAME)
+ calc_iframe_target_size(cpi);
+ else
+ {
+ calc_pframe_target_size(cpi);
+
+ /* Check if we're dropping the frame: */
+ if (cpi->drop_frame)
+ {
+ cpi->drop_frame = 0;
+ return 0;
+ }
+ }
+ return 1;
+}
diff --git a/libvpx/vp8/encoder/ratectrl.h b/libvpx/vp8/encoder/ratectrl.h
new file mode 100644
index 0000000..c43f08d
--- /dev/null
+++ b/libvpx/vp8/encoder/ratectrl.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+extern void vp8_save_coding_context(VP8_COMP *cpi);
+extern void vp8_restore_coding_context(VP8_COMP *cpi);
+
+extern void vp8_setup_key_frame(VP8_COMP *cpi);
+extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
+extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
+extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
+extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
+
+/* return of 0 means drop frame */
+extern int vp8_pick_frame_size(VP8_COMP *cpi);
+
+#endif
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
new file mode 100644
index 0000000..7d80606
--- /dev/null
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -0,0 +1,2629 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/pragmas.h"
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "pickinter.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/findnearmv.h"
+#include "vp8/common/quant_common.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "vp8/common/variance.h"
+#include "mcomp.h"
+#include "rdopt.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/systemdependent.h"
+#if CONFIG_TEMPORAL_DENOISING
+#include "denoising.h"
+#endif
+extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x);
+
+#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
+
+typedef struct rate_distortion_struct
+{
+ int rate2;
+ int rate_y;
+ int rate_uv;
+ int distortion2;
+ int distortion_uv;
+} RATE_DISTORTION;
+
+typedef struct best_mode_struct
+{
+ int yrd;
+ int rd;
+ int intra_rd;
+ MB_MODE_INFO mbmode;
+ union b_mode_info bmodes[16];
+ PARTITION_INFO partition;
+} BEST_MODE;
+
+static const int auto_speed_thresh[17] =
+{
+ 1000,
+ 200,
+ 150,
+ 130,
+ 150,
+ 125,
+ 120,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 115,
+ 105
+};
+
+const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] =
+{
+ ZEROMV,
+ DC_PRED,
+
+ NEARESTMV,
+ NEARMV,
+
+ ZEROMV,
+ NEARESTMV,
+
+ ZEROMV,
+ NEARESTMV,
+
+ NEARMV,
+ NEARMV,
+
+ V_PRED,
+ H_PRED,
+ TM_PRED,
+
+ NEWMV,
+ NEWMV,
+ NEWMV,
+
+ SPLITMV,
+ SPLITMV,
+ SPLITMV,
+
+ B_PRED,
+};
+
+/* This table determines the search order in reference frame priority order,
+ * which may not necessarily match INTRA, LAST, GOLDEN, ARF
+ */
+const int vp8_ref_frame_order[MAX_MODES] =
+{
+ 1,
+ 0,
+
+ 1,
+ 1,
+
+ 2,
+ 2,
+
+ 3,
+ 3,
+
+ 2,
+ 3,
+
+ 0,
+ 0,
+ 0,
+
+ 1,
+ 2,
+ 3,
+
+ 1,
+ 2,
+ 3,
+
+ 0,
+};
+
+static void fill_token_costs(
+ int c[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
+ const vp8_prob p[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]
+)
+{
+ int i, j, k;
+
+
+ for (i = 0; i < BLOCK_TYPES; i++)
+ for (j = 0; j < COEF_BANDS; j++)
+ for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+
+                /* check for pt == 0 and band > 1 if block type 0,
+                 * or band > 0 if block type 1
+                 */
+ if (k == 0 && j > (i == 0))
+ vp8_cost_tokens2(c[i][j][k], p [i][j][k], vp8_coef_tree, 2);
+ else
+ vp8_cost_tokens(c[i][j][k], p [i][j][k], vp8_coef_tree);
+}
+
+static const int rd_iifactor[32] =
+{
+ 4, 4, 3, 2, 1, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* values are now correlated to quantizer */
+static const int sad_per_bit16lut[QINDEX_RANGE] =
+{
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 10, 10,
+ 10, 10, 10, 10, 10, 10, 11, 11,
+ 11, 11, 11, 11, 12, 12, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14
+};
+static const int sad_per_bit4lut[QINDEX_RANGE] =
+{
+ 2, 2, 2, 2, 2, 2, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 5, 5,
+ 5, 5, 5, 5, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 8, 8, 8,
+ 8, 8, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10,
+ 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12,
+ 13, 13, 13, 13, 13, 13, 13, 14,
+ 14, 14, 14, 14, 15, 15, 15, 15,
+ 16, 16, 16, 16, 17, 17, 17, 18,
+ 18, 18, 19, 19, 19, 20, 20, 20,
+};
+
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
+{
+ cpi->mb.sadperbit16 = sad_per_bit16lut[QIndex];
+ cpi->mb.sadperbit4 = sad_per_bit4lut[QIndex];
+}
+
+void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
+{
+ int q;
+ int i;
+ double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
+ double rdconst = 2.80;
+
+ vp8_clear_system_state();
+
+ /* Further tests required to see if optimum is different
+ * for key frames, golden frames and arf frames.
+ */
+ cpi->RDMULT = (int)(rdconst * (capped_q * capped_q));
+
+    /* Extend rate multiplier alongside quantizer zbin increases */
+ if (cpi->zbin_over_quant > 0)
+ {
+ double oq_factor;
+ double modq;
+
+ /* Experimental code using the same basic equation as used for Q above
+ * The units of cpi->zbin_over_quant are 1/128 of Q bin size
+ */
+ oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
+ modq = (int)((double)capped_q * oq_factor);
+ cpi->RDMULT = (int)(rdconst * (modq * modq));
+ }
+
+ if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME))
+ {
+ if (cpi->twopass.next_iiratio > 31)
+ cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
+ else
+ cpi->RDMULT +=
+ (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
+ }
+
+ cpi->mb.errorperbit = (cpi->RDMULT / 110);
+ cpi->mb.errorperbit += (cpi->mb.errorperbit==0);
+
+ vp8_set_speed_features(cpi);
+
+ q = (int)pow(Qvalue, 1.25);
+
+ if (q < 8)
+ q = 8;
+
+ if (cpi->RDMULT > 1000)
+ {
+ cpi->RDDIV = 1;
+ cpi->RDMULT /= 100;
+
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ if (cpi->sf.thresh_mult[i] < INT_MAX)
+ {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+ }
+ else
+ {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+ else
+ {
+ cpi->RDDIV = 100;
+
+ for (i = 0; i < MAX_MODES; i++)
+ {
+ if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
+ {
+ cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+ }
+ else
+ {
+ cpi->rd_threshes[i] = INT_MAX;
+ }
+
+ cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+ }
+ }
+
+ {
+ /* build token cost array for the type of frame we have now */
+ FRAME_CONTEXT *l = &cpi->lfc_n;
+
+ if(cpi->common.refresh_alt_ref_frame)
+ l = &cpi->lfc_a;
+ else if(cpi->common.refresh_golden_frame)
+ l = &cpi->lfc_g;
+
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) l->coef_probs
+ );
+ /*
+ fill_token_costs(
+ cpi->mb.token_costs,
+ (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs);
+ */
+
+
+        /* TODO: make these mode costs depend on last, alt or gold too. (jbb) */
+ vp8_init_mode_costs(cpi);
+ }
+
+}
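+
+/* Worked example (editor's note): for Qvalue = 64,
+ * RDMULT = (int)(2.80 * 64 * 64) = 11468 and errorperbit =
+ * 11468 / 110 = 104. q = (int)pow(64, 1.25) = 181 and, since RDMULT >
+ * 1000, RDDIV = 1, RDMULT becomes 114 and each mode threshold is
+ * thresh_mult[i] * 181 / 100.
+ */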
+
+void vp8_auto_select_speed(VP8_COMP *cpi)
+{
+ int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
+
+ milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
+
+#if 0
+
+ if (0)
+ {
+ FILE *f;
+
+ f = fopen("speed.stt", "a");
+ fprintf(f, " %8ld %10ld %10ld %10ld\n",
+ cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time);
+ fclose(f);
+ }
+
+#endif
+
+ if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress)
+ {
+ if (cpi->avg_pick_mode_time == 0)
+ {
+ cpi->Speed = 4;
+ }
+ else
+ {
+ if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95)
+ {
+ cpi->Speed += 2;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ if (cpi->Speed > 16)
+ {
+ cpi->Speed = 16;
+ }
+ }
+
+ if (milliseconds_for_compress * 100 > cpi->avg_encode_time * auto_speed_thresh[cpi->Speed])
+ {
+ cpi->Speed -= 1;
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+
+ /* In real-time mode, cpi->speed is in [4, 16]. */
+ if (cpi->Speed < 4)
+ {
+ cpi->Speed = 4;
+ }
+ }
+ }
+ }
+ else
+ {
+ cpi->Speed += 4;
+
+ if (cpi->Speed > 16)
+ cpi->Speed = 16;
+
+
+ cpi->avg_pick_mode_time = 0;
+ cpi->avg_encode_time = 0;
+ }
+}
+
+int vp8_block_error_c(short *coeff, short *dqcoeff)
+{
+ int i;
+ int error = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ int this_diff = coeff[i] - dqcoeff[i];
+ error += this_diff * this_diff;
+ }
+
+ return error;
+}
+
+int vp8_mbblock_error_c(MACROBLOCK *mb, int dc)
+{
+ BLOCK *be;
+ BLOCKD *bd;
+ int i, j;
+ int berror, error = 0;
+
+ for (i = 0; i < 16; i++)
+ {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ berror = 0;
+
+ for (j = dc; j < 16; j++)
+ {
+ int this_diff = be->coeff[j] - bd->dqcoeff[j];
+ berror += this_diff * this_diff;
+ }
+
+ error += berror;
+ }
+
+ return error;
+}
+
+int vp8_mbuverror_c(MACROBLOCK *mb)
+{
+
+ BLOCK *be;
+ BLOCKD *bd;
+
+
+ int i;
+ int error = 0;
+
+ for (i = 16; i < 24; i++)
+ {
+ be = &mb->block[i];
+ bd = &mb->e_mbd.block[i];
+
+ error += vp8_block_error_c(be->coeff, bd->dqcoeff);
+ }
+
+ return error;
+}
+
+int VP8_UVSSE(MACROBLOCK *x)
+{
+ unsigned char *uptr, *vptr;
+ unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+ unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+ int uv_stride = x->block[16].src_stride;
+
+ unsigned int sse1 = 0;
+ unsigned int sse2 = 0;
+ int mv_row = x->e_mbd.mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->e_mbd.mode_info_context->mbmi.mv.as_mv.col;
+ int offset;
+ int pre_stride = x->e_mbd.pre.uv_stride;
+
+ if (mv_row < 0)
+ mv_row -= 1;
+ else
+ mv_row += 1;
+
+ if (mv_col < 0)
+ mv_col -= 1;
+ else
+ mv_col += 1;
+
+ mv_row /= 2;
+ mv_col /= 2;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ uptr = x->e_mbd.pre.u_buffer + offset;
+ vptr = x->e_mbd.pre.v_buffer + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ vp8_sub_pixel_variance8x8(uptr, pre_stride,
+ mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
+ vp8_sub_pixel_variance8x8(vptr, pre_stride,
+ mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+ else
+ {
+ vp8_variance8x8(uptr, pre_stride,
+ upred_ptr, uv_stride, &sse2);
+ vp8_variance8x8(vptr, pre_stride,
+ vpred_ptr, uv_stride, &sse1);
+ sse2 += sse1;
+ }
+ return sse2;
+
+}
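+
+/* The chroma MV above is derived from the luma MV (in 1/8-pel units
+ * here) by rounding away from zero and halving.  For example, a luma
+ * mv_row of 10 becomes (10 + 1) / 2 = 5: a full-pel offset of 5 >> 3 = 0
+ * rows plus an eighth-pel fraction of 5 & 7 = 5 for the sub-pixel
+ * variance call.
+ */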
+
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+ int c = !type; /* start at coef 0, unless Y with Y2 */
+ int eob = (int)(*b->eob);
+ int pt ; /* surrounding block/prev coef predictor */
+ int cost = 0;
+ short *qcoeff_ptr = b->qcoeff;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+
+ for (; c < eob; c++)
+ {
+ int v = QC(c);
+ int t = vp8_dct_value_tokens_ptr[v].Token;
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+ cost += vp8_dct_value_cost_ptr[v];
+ pt = vp8_prev_token_class[t];
+ }
+
+# undef QC
+
+ if (c < 16)
+ cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+
+    pt = (c != !type); /* 0 if no coefficients were coded (eob at first position) */
+ *a = *l = pt;
+
+ return cost;
+}
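+
+/* Accounting sketch for cost_coeffs(): each coefficient up to the EOB
+ * contributes a context-conditioned token cost plus the token's
+ * sign/extra-bit cost (vp8_dct_value_cost_ptr), and an explicit
+ * DCT_EOB_TOKEN is priced if the block ends before position 16.  The
+ * totals are in the fixed-point scale that RDCOST() divides out with
+ * its >> 8.
+ */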
+
+static int vp8_rdcost_mby(MACROBLOCK *mb)
+{
+ int cost = 0;
+ int b;
+ MACROBLOCKD *x = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 0; b < 16; b++)
+ cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_NO_DC,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+
+ cost += cost_coeffs(mb, x->block + 24, PLANE_TYPE_Y2,
+ ta + vp8_block2above[24], tl + vp8_block2left[24]);
+
+ return cost;
+}
+
+static void macro_block_yrd( MACROBLOCK *mb,
+ int *Rate,
+ int *Distortion)
+{
+ int b;
+ MACROBLOCKD *const x = &mb->e_mbd;
+ BLOCK *const mb_y2 = mb->block + 24;
+ BLOCKD *const x_y2 = x->block + 24;
+ short *Y2DCPtr = mb_y2->src_diff;
+ BLOCK *beptr;
+ int d;
+
+ vp8_subtract_mby( mb->src_diff, *(mb->block[0].base_src),
+ mb->block[0].src_stride, mb->e_mbd.predictor, 16);
+
+ /* Fdct and building the 2nd order block */
+ for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
+ {
+ mb->short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
+ *Y2DCPtr++ = beptr->coeff[0];
+ *Y2DCPtr++ = beptr->coeff[16];
+ }
+
+ /* 2nd order fdct */
+ mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+
+ /* Quantization */
+ for (b = 0; b < 16; b++)
+ {
+ mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]);
+ }
+
+    /* DC prediction and quantization of 2nd order block */
+ mb->quantize_b(mb_y2, x_y2);
+
+ /* Distortion */
+ d = vp8_mbblock_error(mb, 1) << 2;
+ d += vp8_block_error(mb_y2->coeff, x_y2->dqcoeff);
+
+ *Distortion = (d >> 4);
+
+ /* rate */
+ *Rate = vp8_rdcost_mby(mb);
+}
+
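+/* The predictor is laid out with a 16-byte stride, i.e. 4 unsigned ints
+ * per row, so the word indices 0, 4, 8 and 12 below address the first
+ * 4 bytes of rows 0..3: copy_predictor() moves just the left 4x4 block
+ * of the 16-wide buffer.
+ */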
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor)
+{
+ const unsigned int *p = (const unsigned int *)predictor;
+ unsigned int *d = (unsigned int *)dst;
+ d[0] = p[0];
+ d[4] = p[4];
+ d[8] = p[8];
+ d[12] = p[12];
+}
+static int rd_pick_intra4x4block(
+ MACROBLOCK *x,
+ BLOCK *be,
+ BLOCKD *b,
+ B_PREDICTION_MODE *best_mode,
+ const int *bmode_costs,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+
+ int *bestrate,
+ int *bestratey,
+ int *bestdistortion)
+{
+ B_PREDICTION_MODE mode;
+ int best_rd = INT_MAX;
+ int rate = 0;
+ int distortion;
+
+ ENTROPY_CONTEXT ta = *a, tempa = *a;
+ ENTROPY_CONTEXT tl = *l, templ = *l;
+    /*
+     * The predictor buffer is a 2d buffer with a stride of 16. Create
+     * a temp buffer that meets the stride requirements, but we are only
+     * interested in the left 4x4 block.
+     */
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16*4);
+ DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+ int dst_stride = x->e_mbd.dst.y_stride;
+ unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+
+ unsigned char *Above = dst - dst_stride;
+ unsigned char *yleft = dst - 1;
+ unsigned char top_left = Above[-1];
+
+ for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
+ {
+ int this_rd;
+ int ratey;
+
+ rate = bmode_costs[mode];
+
+ vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+ b->predictor, 16, top_left);
+ vp8_subtract_b(be, b, 16);
+ x->short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b(be, b);
+
+ tempa = ta;
+ templ = tl;
+
+ ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
+ rate += ratey;
+ distortion = vp8_block_error(be->coeff, b->dqcoeff) >> 2;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ *a = tempa;
+ *l = templ;
+ copy_predictor(best_predictor, b->predictor);
+ vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+ }
+ }
+ b->bmi.as_mode = *best_mode;
+
+ vp8_short_idct4x4llm(best_dqcoeff, best_predictor, 16, dst, dst_stride);
+
+ return best_rd;
+}
+
+static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
+ int *rate_y, int *Distortion, int best_rd)
+{
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ int i;
+ int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+ int distortion = 0;
+ int tot_rate_y = 0;
+ int64_t total_rd = 0;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ const int *bmode_costs;
+
+ vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ intra_prediction_down_copy(xd, xd->dst.y_buffer - xd->dst.y_stride + 16);
+
+ bmode_costs = mb->inter_bmode_costs;
+
+ for (i = 0; i < 16; i++)
+ {
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+ B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+ int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+ if (mb->e_mbd.frame_type == KEY_FRAME)
+ {
+ const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+ const B_PREDICTION_MODE L = left_block_mode(mic, i);
+
+ bmode_costs = mb->bmode_costs[A][L];
+ }
+
+ total_rd += rd_pick_intra4x4block(
+ mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
+ ta + vp8_block2above[i],
+ tl + vp8_block2left[i], &r, &ry, &d);
+
+ cost += r;
+ distortion += d;
+ tot_rate_y += ry;
+
+ mic->bmi[i].as_mode = best_mode;
+
+ if(total_rd >= (int64_t)best_rd)
+ break;
+ }
+
+ if(total_rd >= (int64_t)best_rd)
+ return INT_MAX;
+
+ *Rate = cost;
+ *rate_y = tot_rate_y;
+ *Distortion = distortion;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+
+static int rd_pick_intra16x16mby_mode(MACROBLOCK *x,
+ int *Rate,
+ int *rate_y,
+ int *Distortion)
+{
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int rate, ratey;
+ int distortion;
+ int best_rd = INT_MAX;
+ int this_rd;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ /* Y Search for 16x16 intra prediction mode */
+ for (mode = DC_PRED; mode <= TM_PRED; mode++)
+ {
+ xd->mode_info_context->mbmi.mode = mode;
+
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+
+ macro_block_yrd(x, &ratey, &distortion);
+ rate = ratey + x->mbmode_cost[xd->frame_type]
+ [xd->mode_info_context->mbmi.mode];
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *Rate = rate;
+ *rate_y = ratey;
+ *Distortion = distortion;
+ }
+ }
+
+ xd->mode_info_context->mbmi.mode = mode_selected;
+ return best_rd;
+}
+
+static int rd_cost_mbuv(MACROBLOCK *mb)
+{
+ int b;
+ int cost = 0;
+ MACROBLOCKD *x = &mb->e_mbd;
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+
+ vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+
+ for (b = 16; b < 24; b++)
+ cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_UV,
+ ta + vp8_block2above[b], tl + vp8_block2left[b]);
+
+ return cost;
+}
+
+
+static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel)
+{
+ vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
+ vp8_subtract_mbuv(x->src_diff,
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
+
+ vp8_transform_mbuv(x);
+ vp8_quantize_mbuv(x);
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = vp8_mbuverror(x) / 4;
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *distortion, int fullpixel)
+{
+ vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
+ vp8_subtract_mbuv(x->src_diff,
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
+
+ vp8_transform_mbuv(x);
+ vp8_quantize_mbuv(x);
+
+ *rate = rd_cost_mbuv(x);
+ *distortion = vp8_mbuverror(x) / 4;
+
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate,
+ int *rate_tokenonly, int *distortion)
+{
+ MB_PREDICTION_MODE mode;
+ MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+ int best_rd = INT_MAX;
+ int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+ int rate_to;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ for (mode = DC_PRED; mode <= TM_PRED; mode++)
+ {
+ int rate;
+ int distortion;
+ int this_rd;
+
+ xd->mode_info_context->mbmi.uv_mode = mode;
+
+ vp8_build_intra_predictors_mbuv_s(xd,
+ xd->dst.u_buffer - xd->dst.uv_stride,
+ xd->dst.v_buffer - xd->dst.uv_stride,
+ xd->dst.u_buffer - 1,
+ xd->dst.v_buffer - 1,
+ xd->dst.uv_stride,
+ &xd->predictor[256], &xd->predictor[320],
+ 8);
+
+
+ vp8_subtract_mbuv(x->src_diff,
+ x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+ &xd->predictor[256], &xd->predictor[320], 8);
+ vp8_transform_mbuv(x);
+ vp8_quantize_mbuv(x);
+
+ rate_to = rd_cost_mbuv(x);
+ rate = rate_to + x->intra_uv_mode_cost[xd->frame_type][xd->mode_info_context->mbmi.uv_mode];
+
+ distortion = vp8_mbuverror(x) / 4;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd)
+ {
+ best_rd = this_rd;
+ d = distortion;
+ r = rate;
+ *rate_tokenonly = rate_to;
+ mode_selected = mode;
+ }
+ }
+
+ *rate = r;
+ *distortion = d;
+
+ xd->mode_info_context->mbmi.uv_mode = mode_selected;
+}
+
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
+{
+ vp8_prob p [VP8_MVREFS-1];
+ assert(NEARESTMV <= m && m <= SPLITMV);
+ vp8_mv_ref_probs(p, near_mv_ref_ct);
+ return vp8_cost_token(vp8_mv_ref_tree, p,
+ vp8_mv_ref_encoding_array - NEARESTMV + m);
+}
+
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv)
+{
+ x->e_mbd.mode_info_context->mbmi.mode = mb;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = mv->as_int;
+}
+
+static int labels2mode(
+ MACROBLOCK *x,
+ int const *labelings, int which_label,
+ B_PREDICTION_MODE this_mode,
+ int_mv *this_mv, int_mv *best_ref_mv,
+ int *mvcost[2]
+)
+{
+ MACROBLOCKD *const xd = & x->e_mbd;
+ MODE_INFO *const mic = xd->mode_info_context;
+ const int mis = xd->mode_info_stride;
+
+ int cost = 0;
+ int thismvcost = 0;
+
+ /* We have to be careful retrieving previously-encoded motion vectors.
+ Ones from this macroblock have to be pulled from the BLOCKD array
+ as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+
+ int i = 0;
+
+ do
+ {
+ BLOCKD *const d = xd->block + i;
+ const int row = i >> 2, col = i & 3;
+
+ B_PREDICTION_MODE m;
+
+ if (labelings[i] != which_label)
+ continue;
+
+ if (col && labelings[i] == labelings[i-1])
+ m = LEFT4X4;
+ else if (row && labelings[i] == labelings[i-4])
+ m = ABOVE4X4;
+ else
+ {
+ /* the only time we should do costing for new motion vector
+ * or mode is when we are on a new label (jbb May 08, 2007)
+ */
+ switch (m = this_mode)
+ {
+ case NEW4X4 :
+ thismvcost = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102);
+ break;
+ case LEFT4X4:
+ this_mv->as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i);
+ break;
+ case ABOVE4X4:
+ this_mv->as_int = row ? d[-4].bmi.mv.as_int : above_block_mv(mic, i, mis);
+ break;
+ case ZERO4X4:
+ this_mv->as_int = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (m == ABOVE4X4) /* replace above with left if same */
+ {
+ int_mv left_mv;
+
+ left_mv.as_int = col ? d[-1].bmi.mv.as_int :
+ left_block_mv(mic, i);
+
+ if (left_mv.as_int == this_mv->as_int)
+ m = LEFT4X4;
+ }
+
+ cost = x->inter_bmode_costs[ m];
+ }
+
+ d->bmi.mv.as_int = this_mv->as_int;
+
+ x->partition_info->bmi[i].mode = m;
+ x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
+
+ }
+ while (++i < 16);
+
+ cost += thismvcost ;
+ return cost;
+}
+
+static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels,
+ int which_label, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl)
+{
+ int cost = 0;
+ int b;
+ MACROBLOCKD *x = &mb->e_mbd;
+
+ for (b = 0; b < 16; b++)
+ if (labels[ b] == which_label)
+ cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_WITH_DC,
+ ta + vp8_block2above[b],
+ tl + vp8_block2left[b]);
+
+ return cost;
+
+}
+static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels, int which_label)
+{
+ int i;
+ unsigned int distortion = 0;
+ int pre_stride = x->e_mbd.pre.y_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+
+
+ for (i = 0; i < 16; i++)
+ {
+ if (labels[i] == which_label)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+ BLOCK *be = &x->block[i];
+
+ vp8_build_inter_predictors_b(bd, 16, base_pre, pre_stride, x->e_mbd.subpixel_predict);
+ vp8_subtract_b(be, bd, 16);
+ x->short_fdct4x4(be->src_diff, be->coeff, 32);
+ x->quantize_b(be, bd);
+
+ distortion += vp8_block_error(be->coeff, bd->dqcoeff);
+ }
+ }
+
+ return distortion;
+}
+
+
+static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
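+/* If the segmentation enum orders BLOCK_16X8, BLOCK_8X16, BLOCK_8X8,
+ * BLOCK_4X4 as 0..3 (an assumption not visible in this file), the shifts
+ * 3, 3, 2, 0 divide a label's SAD by 8, 8, 4 and 1, normalising each
+ * 128-, 128-, 64- or 16-pixel label to a per-16-pixel figure before the
+ * 4000 full-search threshold in rd_check_segment() is applied.
+ */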
+
+
+typedef struct
+{
+ int_mv *ref_mv;
+ int_mv mvp;
+
+ int segment_rd;
+ int segment_num;
+ int r;
+ int d;
+ int segment_yrate;
+ B_PREDICTION_MODE modes[16];
+ int_mv mvs[16];
+ unsigned char eobs[16];
+
+ int mvthresh;
+ int *mdcounts;
+
+ int_mv sv_mvp[4]; /* save 4 mvp from 8x8 */
+ int sv_istep[2]; /* save 2 initial step_param for 16x8/8x16 */
+
+} BEST_SEG_INFO;
+
+
+static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
+ BEST_SEG_INFO *bsi, unsigned int segmentation)
+{
+ int i;
+ int const *labels;
+ int br = 0;
+ int bd = 0;
+ B_PREDICTION_MODE this_mode;
+
+
+ int label_count;
+ int this_segment_rd = 0;
+ int label_mv_thresh;
+ int rate = 0;
+ int sbr = 0;
+ int sbd = 0;
+ int segmentyrate = 0;
+
+ vp8_variance_fn_ptr_t *v_fn_ptr;
+
+ ENTROPY_CONTEXT_PLANES t_above, t_left;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ ENTROPY_CONTEXT_PLANES t_above_b, t_left_b;
+ ENTROPY_CONTEXT *ta_b;
+ ENTROPY_CONTEXT *tl_b;
+
+ vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta = (ENTROPY_CONTEXT *)&t_above;
+ tl = (ENTROPY_CONTEXT *)&t_left;
+ ta_b = (ENTROPY_CONTEXT *)&t_above_b;
+ tl_b = (ENTROPY_CONTEXT *)&t_left_b;
+
+ br = 0;
+ bd = 0;
+
+ v_fn_ptr = &cpi->fn_ptr[segmentation];
+ labels = vp8_mbsplits[segmentation];
+ label_count = vp8_mbsplit_count[segmentation];
+
+    /* A multiplier of 64 would make this threshold so big that we would
+     * very rarely check MVs on segments; setting it to 1, as here, makes
+     * the MV threshold roughly equal to what it is for whole macroblocks.
+     */
+    label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+ /* Segmentation method overheads */
+ rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
+ rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
+ this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
+ br += rate;
+
+ for (i = 0; i < label_count; i++)
+ {
+ int_mv mode_mv[B_MODE_COUNT];
+ int best_label_rd = INT_MAX;
+ B_PREDICTION_MODE mode_selected = ZERO4X4;
+ int bestlabelyrate = 0;
+
+ /* search for the best motion vector on this segment */
+ for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++)
+ {
+ int this_rd;
+ int distortion;
+ int labelyrate;
+ ENTROPY_CONTEXT_PLANES t_above_s, t_left_s;
+ ENTROPY_CONTEXT *ta_s;
+ ENTROPY_CONTEXT *tl_s;
+
+ vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ ta_s = (ENTROPY_CONTEXT *)&t_above_s;
+ tl_s = (ENTROPY_CONTEXT *)&t_left_s;
+
+ if (this_mode == NEW4X4)
+ {
+ int sseshift;
+ int num00;
+ int step_param = 0;
+ int further_steps;
+ int n;
+ int thissme;
+ int bestsme = INT_MAX;
+ int_mv temp_mv;
+ BLOCK *c;
+ BLOCKD *e;
+
+                /* Is the best so far sufficiently good that we can't justify
+                 * doing a new motion search?
+                 */
+ if (best_label_rd < label_mv_thresh)
+ break;
+
+ if(cpi->compressor_speed)
+ {
+ if (segmentation == BLOCK_8X16 || segmentation == BLOCK_16X8)
+ {
+ bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
+ if (i==1 && segmentation == BLOCK_16X8)
+ bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
+
+ step_param = bsi->sv_istep[i];
+ }
+
+ /* use previous block's result as next block's MV
+ * predictor.
+ */
+ if (segmentation == BLOCK_4X4 && i>0)
+ {
+ bsi->mvp.as_int = x->e_mbd.block[i-1].bmi.mv.as_int;
+ if (i==4 || i==8 || i==12)
+ bsi->mvp.as_int = x->e_mbd.block[i-4].bmi.mv.as_int;
+ step_param = 2;
+ }
+ }
+
+ further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+
+ {
+ int sadpb = x->sadperbit4;
+ int_mv mvp_full;
+
+ mvp_full.as_mv.row = bsi->mvp.as_mv.row >>3;
+ mvp_full.as_mv.col = bsi->mvp.as_mv.col >>3;
+
+ /* find first label */
+ n = vp8_mbsplit_offset[segmentation][i];
+
+ c = &x->block[n];
+ e = &x->e_mbd.block[n];
+
+ {
+ bestsme = cpi->diamond_search_sad(x, c, e, &mvp_full,
+ &mode_mv[NEW4X4], step_param,
+ sadpb, &num00, v_fn_ptr,
+ x->mvcost, bsi->ref_mv);
+
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, c, e,
+ &mvp_full, &temp_mv,
+ step_param + n, sadpb,
+ &num00, v_fn_ptr,
+ x->mvcost, bsi->ref_mv);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEW4X4].as_int = temp_mv.as_int;
+ }
+ }
+ }
+ }
+
+ sseshift = segmentation_to_sseshift[segmentation];
+
+ /* Should we do a full search (best quality only) */
+ if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)
+ {
+ /* Check if mvp_full is within the range. */
+ vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+
+ thissme = cpi->full_search_sad(x, c, e, &mvp_full,
+ sadpb, 16, v_fn_ptr,
+ x->mvcost, bsi->ref_mv);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEW4X4].as_int = e->bmi.mv.as_int;
+ }
+ else
+ {
+ /* The full search result is actually worse so
+ * re-instate the previous best vector
+ */
+ e->bmi.mv.as_int = mode_mv[NEW4X4].as_int;
+ }
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ {
+ int distortion;
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4],
+ bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost,
+ &distortion, &sse);
+
+ }
+ } /* NEW4X4 */
+
+ rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
+ bsi->ref_mv, x->mvcost);
+
+ /* Trap vectors that reach beyond the UMV borders */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ {
+ continue;
+ }
+
+ distortion = vp8_encode_inter_mb_segment(x, labels, i) / 4;
+
+ labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
+ rate += labelyrate;
+
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_label_rd)
+ {
+ sbr = rate;
+ sbd = distortion;
+ bestlabelyrate = labelyrate;
+ mode_selected = this_mode;
+ best_label_rd = this_rd;
+
+ vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ }
+ } /*for each 4x4 mode*/
+
+ vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+
+ labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
+ bsi->ref_mv, x->mvcost);
+
+ br += sbr;
+ bd += sbd;
+ segmentyrate += bestlabelyrate;
+ this_segment_rd += best_label_rd;
+
+ if (this_segment_rd >= bsi->segment_rd)
+ break;
+
+ } /* for each label */
+
+ if (this_segment_rd < bsi->segment_rd)
+ {
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->segment_num = segmentation;
+
+ /* store everything needed to come back to this!! */
+ for (i = 0; i < 16; i++)
+ {
+ bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
+ bsi->modes[i] = x->partition_info->bmi[i].mode;
+ bsi->eobs[i] = x->e_mbd.eobs[i];
+ }
+ }
+}
+
+static void vp8_cal_step_param(int sr, int *sp)
+{
+ int step = 0;
+
+ if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
+ else if (sr < 1) sr = 1;
+
+ while (sr>>=1)
+ step++;
+
+ *sp = MAX_MVSEARCH_STEPS - 1 - step;
+}
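+
+/* Example: a search range sr of 8 halves three times before reaching zero
+ * (8 -> 4 -> 2 -> 1), so step = 3 and *sp = MAX_MVSEARCH_STEPS - 1 - 3.
+ */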
+
+static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
+ int_mv *best_ref_mv, int best_rd,
+ int *mdcounts, int *returntotrate,
+ int *returnyrate, int *returndistortion,
+ int mvthresh)
+{
+ int i;
+ BEST_SEG_INFO bsi;
+
+ vpx_memset(&bsi, 0, sizeof(bsi));
+
+ bsi.segment_rd = best_rd;
+ bsi.ref_mv = best_ref_mv;
+ bsi.mvp.as_int = best_ref_mv->as_int;
+ bsi.mvthresh = mvthresh;
+ bsi.mdcounts = mdcounts;
+
+ for(i = 0; i < 16; i++)
+ {
+ bsi.modes[i] = ZERO4X4;
+ }
+
+ if(cpi->compressor_speed == 0)
+ {
+ /* for now, we will keep the original segmentation order
+ when in best quality mode */
+ rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+ rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+ }
+ else
+ {
+ int sr;
+
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+
+ if (bsi.segment_rd < best_rd)
+ {
+ int col_min = ((best_ref_mv->as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv->as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv->as_mv.col>>3) + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv->as_mv.row>>3) + MAX_FULL_PEL_VAL;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ /* Get 8x8 result */
+ bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
+ bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
+ bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
+ bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
+
+            /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range according to the closeness of the 2 MVs. */
+ /* block 8X16 */
+ {
+ sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row))>>3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+
+ sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row))>>3, (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+
+ rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+ }
+
+ /* block 16X8 */
+ {
+ sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row))>>3, (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[0]);
+
+ sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row))>>3, (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col))>>3);
+ vp8_cal_step_param(sr, &bsi.sv_istep[1]);
+
+ rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+ }
+
+ /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
+            /* Do not skip 4x4 if speed=0 (good quality) */
+ if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8) /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+ {
+ bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
+ rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+ }
+
+ /* restore UMV window */
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+ }
+ }
+
+ /* set it to the best */
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *bd = &x->e_mbd.block[i];
+
+ bd->bmi.mv.as_int = bsi.mvs[i].as_int;
+ *bd->eob = bsi.eobs[i];
+ }
+
+ *returntotrate = bsi.r;
+ *returndistortion = bsi.d;
+ *returnyrate = bsi.segment_yrate;
+
+ /* save partitions */
+ x->e_mbd.mode_info_context->mbmi.partitioning = bsi.segment_num;
+ x->partition_info->count = vp8_mbsplit_count[bsi.segment_num];
+
+ for (i = 0; i < x->partition_info->count; i++)
+ {
+ int j;
+
+ j = vp8_mbsplit_offset[bsi.segment_num][i];
+
+ x->partition_info->bmi[i].mode = bsi.modes[j];
+ x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
+ }
+ /*
+ * used to set x->e_mbd.mode_info_context->mbmi.mv.as_int
+ */
+ x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
+
+ return bsi.segment_rd;
+}
+
+/* The improved MV prediction */
+void vp8_mv_pred
+(
+ VP8_COMP *cpi,
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *mvp,
+ int refframe,
+ int *ref_frame_sign_bias,
+ int *sr,
+ int near_sadidx[]
+)
+{
+ const MODE_INFO *above = here - xd->mode_info_stride;
+ const MODE_INFO *left = here - 1;
+ const MODE_INFO *aboveleft = above - 1;
+ int_mv near_mvs[8];
+ int near_ref[8];
+ int_mv mv;
+ int vcnt=0;
+ int find=0;
+ int mb_offset;
+
+ int mvx[8];
+ int mvy[8];
+ int i;
+
+ mv.as_int = 0;
+
+ if(here->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = near_mvs[7].as_int = 0;
+ near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
+
+        /* read in 3 nearby blocks' MVs from the current frame as prediction
+         * candidates.
+         */
+ if (above->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = above->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[above->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = above->mbmi.ref_frame;
+ }
+ vcnt++;
+ if (left->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = left->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[left->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = left->mbmi.ref_frame;
+ }
+ vcnt++;
+ if (aboveleft->mbmi.ref_frame != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = aboveleft->mbmi.mv.as_int;
+ mv_bias(ref_frame_sign_bias[aboveleft->mbmi.ref_frame], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = aboveleft->mbmi.ref_frame;
+ }
+ vcnt++;
+
+        /* read in 5 nearby blocks' MVs from the last frame. */
+ if(cpi->common.last_frame_type != KEY_FRAME)
+ {
+ mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride +1) + (-xd->mb_to_left_edge/128 +1) ;
+
+ /* current in last frame */
+ if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset];
+ }
+ vcnt++;
+
+ /* above in last frame */
+ if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride-1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset - xd->mode_info_stride-1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1];
+ }
+ vcnt++;
+
+ /* left in last frame */
+ if (cpi->lf_ref_frame[mb_offset-1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset -1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset -1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset - 1];
+ }
+ vcnt++;
+
+ /* right in last frame */
+ if (cpi->lf_ref_frame[mb_offset +1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset +1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset +1];
+ }
+ vcnt++;
+
+ /* below in last frame */
+ if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1] != INTRA_FRAME)
+ {
+ near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride +1].as_int;
+ mv_bias(cpi->lf_ref_frame_sign_bias[mb_offset + xd->mode_info_stride +1], refframe, &near_mvs[vcnt], ref_frame_sign_bias);
+ near_ref[vcnt] = cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1];
+ }
+ vcnt++;
+ }
+
+ for(i=0; i< vcnt; i++)
+ {
+ if(near_ref[near_sadidx[i]] != INTRA_FRAME)
+ {
+ if(here->mbmi.ref_frame == near_ref[near_sadidx[i]])
+ {
+ mv.as_int = near_mvs[near_sadidx[i]].as_int;
+ find = 1;
+ if (i < 3)
+ *sr = 3;
+ else
+ *sr = 2;
+ break;
+ }
+ }
+ }
+
+ if(!find)
+ {
+ for(i=0; i<vcnt; i++)
+ {
+ mvx[i] = near_mvs[i].as_mv.row;
+ mvy[i] = near_mvs[i].as_mv.col;
+ }
+
+ insertsortmv(mvx, vcnt);
+ insertsortmv(mvy, vcnt);
+ mv.as_mv.row = mvx[vcnt/2];
+ mv.as_mv.col = mvy[vcnt/2];
+
+ find = 1;
+ /* sr is set to 0 to allow calling function to decide the search
+ * range.
+ */
+ *sr = 0;
+ }
+ }
+
+ /* Set up return values */
+ mvp->as_int = mv.as_int;
+ vp8_clamp_mv2(mvp, xd);
+}
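+
+/* When no neighbour shares the current reference frame, the fallback
+ * above takes the component-wise median of the candidate vectors.  For
+ * example, candidate rows {-4, 0, 6, 0, 2} sort to {-4, 0, 0, 2, 6} and
+ * the vcnt/2 = 2 element, 0, becomes the predicted row; columns are
+ * treated the same way.
+ */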
+
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[])
+{
+ /* near_sad indexes:
+ * 0-cf above, 1-cf left, 2-cf aboveleft,
+ * 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+ */
+ int near_sad[8] = {0};
+ BLOCK *b = &x->block[0];
+ unsigned char *src_y_ptr = *(b->base_src);
+
+    /* calculate sad for the current frame's 3 nearby MBs. */
+ if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+ {
+ near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+ }else if(xd->mb_to_top_edge==0)
+ { /* only has left MB for sad calculation. */
+ near_sad[0] = near_sad[2] = INT_MAX;
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+ }else if(xd->mb_to_left_edge ==0)
+    {   /* only has above MB for sad calculation. */
+ near_sad[1] = near_sad[2] = INT_MAX;
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+ }else
+ {
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+ near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
+ }
+
+ if(cpi->common.last_frame_type != KEY_FRAME)
+ {
+        /* calculate sad for the last frame's 5 nearby MBs. */
+ unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+ int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+ if(xd->mb_to_top_edge==0) near_sad[4] = INT_MAX;
+ if(xd->mb_to_left_edge ==0) near_sad[5] = INT_MAX;
+ if(xd->mb_to_right_edge ==0) near_sad[6] = INT_MAX;
+ if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
+
+ if(near_sad[4] != INT_MAX)
+ near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
+ if(near_sad[5] != INT_MAX)
+ near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
+ near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
+ if(near_sad[6] != INT_MAX)
+ near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
+ if(near_sad[7] != INT_MAX)
+ near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
+ }
+
+ if(cpi->common.last_frame_type != KEY_FRAME)
+ {
+ insertsortsad(near_sad, near_sadidx, 8);
+ }else
+ {
+ insertsortsad(near_sad, near_sadidx, 3);
+ }
+}
+
+static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+{
+ if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
+ {
+ int i;
+
+ for (i = 0; i < x->partition_info->count; i++)
+ {
+ if (x->partition_info->bmi[i].mode == NEW4X4)
+ {
+ x->MVcount[0][mv_max+((x->partition_info->bmi[i].mv.as_mv.row
+ - best_ref_mv->as_mv.row) >> 1)]++;
+ x->MVcount[1][mv_max+((x->partition_info->bmi[i].mv.as_mv.col
+ - best_ref_mv->as_mv.col) >> 1)]++;
+ }
+ }
+ }
+ else if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+ {
+ x->MVcount[0][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.row
+ - best_ref_mv->as_mv.row) >> 1)]++;
+ x->MVcount[1][mv_max+((x->e_mbd.mode_info_context->mbmi.mv.as_mv.col
+ - best_ref_mv->as_mv.col) >> 1)]++;
+ }
+}
+
+static int evaluate_inter_mode_rd(int mdcounts[4],
+ RATE_DISTORTION* rd,
+ int* disable_skip,
+ VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+ BLOCK *b = &x->block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ int distortion;
+ vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.predictor, 16);
+
+ if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+ x->skip = 1;
+ }
+ else if (x->encode_breakout)
+ {
+ unsigned int sse;
+ unsigned int var;
+ unsigned int threshold = (xd->block[0].dequant[1]
+ * xd->block[0].dequant[1] >>4);
+
+ if(threshold < x->encode_breakout)
+ threshold = x->encode_breakout;
+
+ var = vp8_variance16x16
+ (*(b->base_src), b->src_stride,
+ x->e_mbd.predictor, 16, &sse);
+
+ if (sse < threshold)
+ {
+ unsigned int q2dc = xd->block[24].dequant[0];
+            /* If there is no codeable 2nd order dc
+               or a very small uniform pixel change */
+ if ((sse - var < q2dc * q2dc >>4) ||
+ (sse /2 > var && sse-var < 64))
+ {
+ /* Check u and v to make sure skip is ok */
+ unsigned int sse2 = VP8_UVSSE(x);
+ if (sse2 * 2 < threshold)
+ {
+ x->skip = 1;
+ rd->distortion2 = sse + sse2;
+ rd->rate2 = 500;
+
+ /* for best_yrd calculation */
+ rd->rate_uv = 0;
+ rd->distortion_uv = sse2;
+
+ *disable_skip = 1;
+ return RDCOST(x->rdmult, x->rddiv, rd->rate2,
+ rd->distortion2);
+ }
+ }
+ }
+ }
+
+
+ /* Add in the Mv/mode cost */
+ rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+ /* Y cost and distortion */
+ macro_block_yrd(x, &rd->rate_y, &distortion);
+ rd->rate2 += rd->rate_y;
+ rd->distortion2 += distortion;
+
+ /* UV cost and distortion */
+ rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv,
+ cpi->common.full_pixel);
+ rd->rate2 += rd->rate_uv;
+ rd->distortion2 += rd->distortion_uv;
+ return INT_MAX;
+}
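+
+/* Worked numbers for the breakout test above: with a dequant[1] of 40 the
+ * base threshold is 40 * 40 >> 4 = 100, raised to x->encode_breakout if
+ * that is larger.  A prediction whose 16x16 SSE lands under the threshold,
+ * whose residual is essentially flat (sse - var small against the 2nd
+ * order DC quantiser), and whose chroma SSE doubled stays under the same
+ * threshold lets the mode skip coefficient coding entirely.
+ */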
+
+static int calculate_final_rd_costs(int this_rd,
+ RATE_DISTORTION* rd,
+ int* other_cost,
+ int disable_skip,
+ int uv_intra_tteob,
+ int intra_rd_penalty,
+ VP8_COMP *cpi, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+    /* Where skip is allowable, add in the default per-mb cost for the
+     * no-skip case.  Where we then decide to skip, we have to delete
+     * this and replace it with the cost of signalling a skip.
+     */
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
+ rd->rate2 += *other_cost;
+ }
+
+ /* Estimate the reference frame signaling cost and add it
+ * to the rolling cost variable.
+ */
+ rd->rate2 +=
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ if (!disable_skip)
+ {
+        /* Test for the condition where the skip block will be activated
+         * because there are no non-zero coefficients, and make any
+         * necessary adjustment to the rate.
+         */
+ if (cpi->common.mb_no_coeff_skip)
+ {
+ int i;
+ int tteob;
+ int has_y2_block = (this_mode!=SPLITMV && this_mode!=B_PRED);
+
+ tteob = 0;
+ if(has_y2_block)
+ tteob += x->e_mbd.eobs[24];
+
+ for (i = 0; i < 16; i++)
+ tteob += (x->e_mbd.eobs[i] > has_y2_block);
+
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ for (i = 16; i < 24; i++)
+ tteob += x->e_mbd.eobs[i];
+ }
+ else
+ tteob += uv_intra_tteob;
+
+ if (tteob == 0)
+ {
+ rd->rate2 -= (rd->rate_y + rd->rate_uv);
+ /* for best_yrd calculation */
+ rd->rate_uv = 0;
+
+ /* Back out no skip flag costing and add in skip flag costing */
+ if (cpi->prob_skip_false)
+ {
+ int prob_skip_cost;
+
+ prob_skip_cost = vp8_cost_bit(cpi->prob_skip_false, 1);
+ prob_skip_cost -= vp8_cost_bit(cpi->prob_skip_false, 0);
+ rd->rate2 += prob_skip_cost;
+ *other_cost += prob_skip_cost;
+ }
+ }
+ }
+ /* Calculate the final RD estimate for this mode */
+ this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2);
+ if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
+ == INTRA_FRAME)
+ this_rd += intra_rd_penalty;
+ }
+ return this_rd;
+}
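+
+/* In equation form, when every block's EOB total comes to zero the rate
+ * above is adjusted as
+ *   rate2' = rate2 - (rate_y + rate_uv)
+ *            + cost_bit(prob_skip_false, 1) - cost_bit(prob_skip_false, 0),
+ * i.e. the coefficient rate is refunded and the no-skip flag cost already
+ * folded into other_cost is swapped for the cost of signalling a skip.
+ */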
+
+static void update_best_mode(BEST_MODE* best_mode, int this_rd,
+ RATE_DISTORTION* rd, int other_cost, MACROBLOCK *x)
+{
+ MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
+
+ other_cost +=
+ x->ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
+
+ /* Calculate the final y RD estimate for this mode */
+ best_mode->yrd = RDCOST(x->rdmult, x->rddiv, (rd->rate2-rd->rate_uv-other_cost),
+ (rd->distortion2-rd->distortion_uv));
+
+ best_mode->rd = this_rd;
+ vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+ vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+
+ if ((this_mode == B_PRED) || (this_mode == SPLITMV))
+ {
+ int i;
+ for (i = 0; i < 16; i++)
+ {
+ best_mode->bmodes[i] = x->e_mbd.block[i].bmi;
+ }
+ }
+}
+
+void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
+ int recon_uvoffset, int *returnrate,
+ int *returndistortion, int *returnintra)
+{
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ MACROBLOCKD *xd = &x->e_mbd;
+ int_mv best_ref_mv_sb[2];
+ int_mv mode_mv_sb[2][MB_MODE_COUNT];
+ int_mv best_ref_mv;
+ int_mv *mode_mv;
+ MB_PREDICTION_MODE this_mode;
+ int num00;
+ int best_mode_index = 0;
+ BEST_MODE best_mode;
+
+ int i;
+ int mode_index;
+ int mdcounts[4];
+ int rate;
+ RATE_DISTORTION rd;
+ int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+ int uv_intra_tteob = 0;
+ int uv_intra_done = 0;
+
+ MB_PREDICTION_MODE uv_intra_mode = 0;
+ int_mv mvp;
+ int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ int saddone=0;
+    /* search range obtained from mv_pred(); it uses step_param levels (0-7) */
+ int sr=0;
+
+ unsigned char *plane[4][3];
+ int ref_frame_map[4];
+ int sign_bias = 0;
+
+ int intra_rd_penalty = 10* vp8_dc_quant(cpi->common.base_qindex,
+ cpi->common.y1dc_delta_q);
+
+#if CONFIG_TEMPORAL_DENOISING
+ unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX,
+ best_rd_sse = INT_MAX;
+#endif
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = 0;
+ best_mode.rd = INT_MAX;
+ best_mode.yrd = INT_MAX;
+ best_mode.intra_rd = INT_MAX;
+ vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+ vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
+
+ /* Setup search priorities */
+ get_reference_search_order(cpi, ref_frame_map);
+
+ /* Check to see if there is at least 1 valid reference frame that we need
+ * to calculate near_mvs.
+ */
+ if (ref_frame_map[1] > 0)
+ {
+ sign_bias = vp8_find_near_mvs_bias(&x->e_mbd,
+ x->e_mbd.mode_info_context,
+ mode_mv_sb,
+ best_ref_mv_sb,
+ mdcounts,
+ ref_frame_map[1],
+ cpi->common.ref_frame_sign_bias);
+
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+
+ get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
+
+ *returnintra = INT_MAX;
+ /* Count of the number of MBs tested so far this frame */
+ cpi->mbs_tested_so_far++;
+
+ x->skip = 0;
+
+ for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+ {
+ int this_rd = INT_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
+
+ /* Test best rd so far against threshold for trying this mode. */
+ if (best_mode.rd <= cpi->rd_threshes[mode_index])
+ continue;
+
+ if (this_ref_frame < 0)
+ continue;
+
+        /* These variables hold the rolling total cost and distortion for
+ * this mode
+ */
+ rd.rate2 = 0;
+ rd.distortion2 = 0;
+
+ this_mode = vp8_mode_order[mode_index];
+
+ x->e_mbd.mode_info_context->mbmi.mode = this_mode;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+
+ /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ * unless ARNR filtering is enabled in which case we want
+ * an unfiltered alternative
+ */
+ if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ {
+ if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
+ continue;
+ }
+
+ /* everything but intra */
+ if (x->e_mbd.mode_info_context->mbmi.ref_frame)
+ {
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ if (sign_bias != cpi->common.ref_frame_sign_bias[this_ref_frame])
+ {
+ sign_bias = cpi->common.ref_frame_sign_bias[this_ref_frame];
+ mode_mv = mode_mv_sb[sign_bias];
+ best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
+ }
+ }
+
+ /* Check to see if the testing frequency for this mode is at its
+         * max.  If so, prevent it from being tested and increase the
+         * threshold for its testing.
+ */
+ if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+ {
+ if (cpi->mbs_tested_so_far <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
+ {
+ /* Increase the threshold for coding this mode to make it
+ * less likely to be chosen
+ */
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+
+ continue;
+ }
+ }
+
+ /* We have now reached the point where we are going to test the
+ * current mode so increment the counter for the number of times
+ * it has been tested
+ */
+ cpi->mode_test_hit_counts[mode_index] ++;
+
+ /* Experimental code. Special case for gf and arf zeromv modes.
+         * Increase zbin size to suppress noise
+ */
+ if (cpi->zbin_mode_boost_enabled)
+ {
+ if ( this_ref_frame == INTRA_FRAME )
+ cpi->zbin_mode_boost = 0;
+ else
+ {
+ if (vp8_mode_order[mode_index] == ZEROMV)
+ {
+ if (this_ref_frame != LAST_FRAME)
+ cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+ else
+ cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+ }
+ else if (vp8_mode_order[mode_index] == SPLITMV)
+ cpi->zbin_mode_boost = 0;
+ else
+ cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+ }
+
+ vp8_update_zbin_extra(cpi, x);
+ }
+
+ if(!uv_intra_done && this_ref_frame == INTRA_FRAME)
+ {
+ rd_pick_intra_mbuv_mode(x, &uv_intra_rate,
+ &uv_intra_rate_tokenonly,
+ &uv_intra_distortion);
+ uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
+
+ /*
+             * Total of the eobs is used later to further adjust rate2. Since
+             * the uv blocks' intra eobs will be overwritten when we check inter modes,
+ * we need to save uv_intra_tteob here.
+ */
+ for (i = 16; i < 24; i++)
+ uv_intra_tteob += x->e_mbd.eobs[i];
+
+ uv_intra_done = 1;
+ }
+
+ switch (this_mode)
+ {
+ case B_PRED:
+ {
+ int tmp_rd;
+
+ /* Note the rate value returned here includes the cost of
+             * coding the B_PRED mode: x->mbmode_cost[x->e_mbd.frame_type][B_PRED]
+ */
+ int distortion;
+ tmp_rd = rd_pick_intra4x4mby_modes(x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
+ rd.rate2 += rate;
+ rd.distortion2 += distortion;
+
+ if(tmp_rd < best_mode.yrd)
+ {
+ rd.rate2 += uv_intra_rate;
+ rd.rate_uv = uv_intra_rate_tokenonly;
+ rd.distortion2 += uv_intra_distortion;
+ rd.distortion_uv = uv_intra_distortion;
+ }
+ else
+ {
+ this_rd = INT_MAX;
+ disable_skip = 1;
+ }
+ }
+ break;
+
+ case SPLITMV:
+ {
+ int tmp_rd;
+ int this_rd_thresh;
+ int distortion;
+
+ this_rd_thresh = (vp8_ref_frame_order[mode_index] == 1) ? cpi->rd_threshes[THR_NEW1] : cpi->rd_threshes[THR_NEW3];
+ this_rd_thresh = (vp8_ref_frame_order[mode_index] == 2) ? cpi->rd_threshes[THR_NEW2] : this_rd_thresh;
+
+ tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
+ best_mode.yrd, mdcounts,
+ &rate, &rd.rate_y, &distortion, this_rd_thresh) ;
+
+ rd.rate2 += rate;
+ rd.distortion2 += distortion;
+
+ /* If even the 'Y' rd value of split is higher than best so far
+                 * then don't bother looking at UV
+ */
+ if (tmp_rd < best_mode.yrd)
+ {
+ /* Now work out UV cost and add it in */
+ rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, cpi->common.full_pixel);
+ rd.rate2 += rd.rate_uv;
+ rd.distortion2 += rd.distortion_uv;
+ }
+ else
+ {
+ this_rd = INT_MAX;
+ disable_skip = 1;
+ }
+ }
+ break;
+ case DC_PRED:
+ case V_PRED:
+ case H_PRED:
+ case TM_PRED:
+ {
+ int distortion;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ vp8_build_intra_predictors_mby_s(xd,
+ xd->dst.y_buffer - xd->dst.y_stride,
+ xd->dst.y_buffer - 1,
+ xd->dst.y_stride,
+ xd->predictor,
+ 16);
+ macro_block_yrd(x, &rd.rate_y, &distortion) ;
+ rd.rate2 += rd.rate_y;
+ rd.distortion2 += distortion;
+ rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+ rd.rate2 += uv_intra_rate;
+ rd.rate_uv = uv_intra_rate_tokenonly;
+ rd.distortion2 += uv_intra_distortion;
+ rd.distortion_uv = uv_intra_distortion;
+ }
+ break;
+
+ case NEWMV:
+ {
+ int thissme;
+ int bestsme = INT_MAX;
+ int step_param = cpi->sf.first_step;
+ int further_steps;
+ int n;
+ int do_refine=1; /* If last step (1-away) of n-step search doesn't pick the center point as the best match,
+ we will do a final 1-away diamond refining search */
+
+ int sadpb = x->sadperbit16;
+ int_mv mvp_full;
+
+ int col_min = ((best_ref_mv.as_mv.col+7)>>3) - MAX_FULL_PEL_VAL;
+ int row_min = ((best_ref_mv.as_mv.row+7)>>3) - MAX_FULL_PEL_VAL;
+ int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL;
+ int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ if(!saddone)
+ {
+ vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+ saddone = 1;
+ }
+
+ vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+ x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+ mvp_full.as_mv.col = mvp.as_mv.col>>3;
+ mvp_full.as_mv.row = mvp.as_mv.row>>3;
+
+ /* Get intersection of UMV window and valid MV window to
+ * reduce # of checks in diamond search.
+ */
+ if (x->mv_col_min < col_min )
+ x->mv_col_min = col_min;
+ if (x->mv_col_max > col_max )
+ x->mv_col_max = col_max;
+ if (x->mv_row_min < row_min )
+ x->mv_row_min = row_min;
+ if (x->mv_row_max > row_max )
+ x->mv_row_max = row_max;
+
+ /* adjust search range according to sr from mv prediction */
+ if(sr > step_param)
+ step_param = sr;
+
+ /* Initial step/diamond search */
+ {
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv,
+ step_param, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* Further step/diamond searches as necessary */
+ n = 0;
+ further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+ n = num00;
+ num00 = 0;
+
+ /* If there won't be more n-step search, check to see if refining search is needed. */
+ if (n > further_steps)
+ do_refine = 0;
+
+ while (n < further_steps)
+ {
+ n++;
+
+ if (num00)
+ num00--;
+ else
+ {
+ thissme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+ &d->bmi.mv, step_param + n, sadpb, &num00,
+ &cpi->fn_ptr[BLOCK_16X16], x->mvcost,
+ &best_ref_mv);
+
+ /* check to see if refining search is needed. */
+ if (num00 > (further_steps-n))
+ do_refine = 0;
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+ }
+ }
+
+ /* final 1-away diamond refining search */
+ if (do_refine == 1)
+ {
+ int search_range;
+
+ search_range = 8;
+
+ thissme = cpi->refining_search_sad(x, b, d, &d->bmi.mv, sadpb,
+ search_range, &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &best_ref_mv);
+
+ if (thissme < bestsme)
+ {
+ bestsme = thissme;
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+ }
+ else
+ {
+ d->bmi.mv.as_int = mode_mv[NEWMV].as_int;
+ }
+ }
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX)
+ {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ x->mvcost, &dis, &sse);
+ }
+
+ mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
+
+ /* Add the new motion vector cost to our rolling cost variable */
+ rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+ }
+
+ case NEARESTMV:
+ case NEARMV:
+            /* Clip "next_nearest" so that it does not extend too far out
+             * of the image
+ */
+ vp8_clamp_mv2(&mode_mv[this_mode], xd);
+
+ /* Do not bother proceeding if the vector (from newmv, nearest
+ * or near) is 0,0 as this should then be coded using the zeromv
+ * mode.
+ */
+ if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0))
+ continue;
+
+ case ZEROMV:
+
+ /* Trap vectors that reach beyond the UMV borders
+             * Note that ALL New MV, Nearest MV, Near MV and Zero MV code
+ * drops through to this point because of the lack of break
+ * statements in the previous two cases.
+ */
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+ ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
+ continue;
+
+ vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+ this_rd = evaluate_inter_mode_rd(mdcounts, &rd,
+ &disable_skip, cpi, x);
+ break;
+
+ default:
+ break;
+ }
+
+ this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+ disable_skip, uv_intra_tteob,
+ intra_rd_penalty, cpi, x);
+
+ /* Keep record of best intra distortion */
+ if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+ (this_rd < best_mode.intra_rd) )
+ {
+ best_mode.intra_rd = this_rd;
+ *returnintra = rd.distortion2 ;
+ }
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ unsigned int sse;
+ vp8_get_inter_mbpred_error(x,&cpi->fn_ptr[BLOCK_16X16],&sse,
+ mode_mv[this_mode]);
+
+ if (sse < best_rd_sse)
+ best_rd_sse = sse;
+
+ /* Store for later use by denoiser. */
+ if (this_mode == ZEROMV && sse < zero_mv_sse )
+ {
+ zero_mv_sse = sse;
+ x->best_zeromv_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+
+ /* Store the best NEWMV in x for later use in the denoiser. */
+ if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
+ sse < best_sse)
+ {
+ best_sse = sse;
+ vp8_get_inter_mbpred_error(x,&cpi->fn_ptr[BLOCK_16X16],&best_sse,
+ mode_mv[this_mode]);
+ x->best_sse_inter_mode = NEWMV;
+ x->best_sse_mv = x->e_mbd.mode_info_context->mbmi.mv;
+ x->need_to_clamp_best_mvs =
+ x->e_mbd.mode_info_context->mbmi.need_to_clamp_mvs;
+ x->best_reference_frame =
+ x->e_mbd.mode_info_context->mbmi.ref_frame;
+ }
+ }
+#endif
+
+        /* Did this mode help, i.e. is it the new best mode? */
+ if (this_rd < best_mode.rd || x->skip)
+ {
+ /* Note index of best mode so far */
+ best_mode_index = mode_index;
+ *returnrate = rd.rate2;
+ *returndistortion = rd.distortion2;
+ if (this_mode <= B_PRED)
+ {
+ x->e_mbd.mode_info_context->mbmi.uv_mode = uv_intra_mode;
+ /* required for left and above block mv */
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ }
+ update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+
+
+ /* Testing this mode gave rise to an improvement in best error
+ * score. Lower threshold a bit for next time
+ */
+ cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ /* If the mode did not help improve the best error case then raise
+ * the threshold for testing that mode next time around.
+ */
+ else
+ {
+ cpi->rd_thresh_mult[mode_index] += 4;
+
+ if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+ cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+ cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+ }
+
+ if (x->skip)
+ break;
+
+ }
+
+ /* Reduce the activation RD thresholds for the best choice mode */
+ if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+ cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+ }
+
+    /* Note how often each mode is chosen as best */
+ cpi->mode_chosen_counts[best_mode_index] ++;
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity)
+ {
+ if (x->best_sse_inter_mode == DC_PRED)
+ {
+ /* No best MV found. */
+ x->best_sse_inter_mode = best_mode.mbmode.mode;
+ x->best_sse_mv = best_mode.mbmode.mv;
+ x->need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
+ x->best_reference_frame = best_mode.mbmode.ref_frame;
+ best_sse = best_rd_sse;
+ }
+ vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
+ recon_yoffset, recon_uvoffset);
+
+
+ /* Reevaluate ZEROMV after denoising. */
+ if (best_mode.mbmode.ref_frame == INTRA_FRAME &&
+ x->best_zeromv_reference_frame != INTRA_FRAME)
+ {
+ int this_rd = INT_MAX;
+ int disable_skip = 0;
+ int other_cost = 0;
+ int this_ref_frame = x->best_zeromv_reference_frame;
+ rd.rate2 = x->ref_frame_cost[this_ref_frame] +
+ vp8_cost_mv_ref(ZEROMV, mdcounts);
+ rd.distortion2 = 0;
+
+ /* set up the proper prediction buffers for the frame */
+ x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
+ x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
+ x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
+ x->e_mbd.pre.v_buffer = plane[this_ref_frame][2];
+
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+
+ this_rd = evaluate_inter_mode_rd(mdcounts, &rd, &disable_skip, cpi, x);
+ this_rd = calculate_final_rd_costs(this_rd, &rd, &other_cost,
+ disable_skip, uv_intra_tteob,
+ intra_rd_penalty, cpi, x);
+ if (this_rd < best_mode.rd || x->skip)
+ {
+ /* Note index of best mode so far */
+ best_mode_index = mode_index;
+ *returnrate = rd.rate2;
+ *returndistortion = rd.distortion2;
+ update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
+ }
+ }
+
+ }
+#endif
+
+ if (cpi->is_src_frame_alt_ref &&
+ (best_mode.mbmode.mode != ZEROMV || best_mode.mbmode.ref_frame != ALTREF_FRAME))
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = ZEROMV;
+ x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+ x->e_mbd.mode_info_context->mbmi.mv.as_int = 0;
+ x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
+ x->e_mbd.mode_info_context->mbmi.mb_skip_coeff =
+ (cpi->common.mb_no_coeff_skip);
+ x->e_mbd.mode_info_context->mbmi.partitioning = 0;
+ return;
+ }
+
+
+ /* macroblock modes */
+ vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
+
+ if (best_mode.mbmode.mode == B_PRED)
+ {
+ for (i = 0; i < 16; i++)
+ xd->mode_info_context->bmi[i].as_mode = best_mode.bmodes[i].as_mode;
+ }
+
+ if (best_mode.mbmode.mode == SPLITMV)
+ {
+ for (i = 0; i < 16; i++)
+ xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
+
+ vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+
+ x->e_mbd.mode_info_context->mbmi.mv.as_int =
+ x->partition_info->bmi[15].mv.as_int;
+ }
+
+ if (sign_bias
+ != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
+ best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
+
+ rd_update_mvcount(cpi, x, &best_ref_mv);
+}
+
+void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
+{
+ int error4x4, error16x16;
+ int rate4x4, rate16x16 = 0, rateuv;
+ int dist4x4, dist16x16, distuv;
+ int rate;
+ int rate4x4_tokenonly = 0;
+ int rate16x16_tokenonly = 0;
+ int rateuv_tokenonly = 0;
+
+ x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+
+ rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv);
+ rate = rateuv;
+
+ error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly,
+ &dist16x16);
+
+ error4x4 = rd_pick_intra4x4mby_modes(x, &rate4x4, &rate4x4_tokenonly,
+ &dist4x4, error16x16);
+
+ if (error4x4 < error16x16)
+ {
+ x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+ rate += rate4x4;
+ }
+ else
+ {
+ rate += rate16x16;
+ }
+
+ *rate_ = rate;
+}
diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h
new file mode 100644
index 0000000..d7b0442
--- /dev/null
+++ b/libvpx/vp8/encoder/rdopt.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+
+#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
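+
+/* Illustrative note (not part of the upstream change): RDCOST folds a rate
+ * term R, typically a cost in 1/256-bit units from the vp8_cost_* helpers,
+ * scaled by the rate multiplier RM into the same domain as the distortion D
+ * scaled by DM; the "+128 ... >> 8" rescales the product with rounding. A
+ * minimal sketch with made-up multiplier values:
+ */
+#if 0
+static int example_rdcost(void)
+{
+    int rm   = 300;    /* hypothetical rate multiplier (lambda) */
+    int dm   = 4;      /* hypothetical distortion multiplier */
+    int rate = 2560;   /* 10 bits, expressed in 1/256-bit units */
+    int dist = 100;
+
+    /* (128 + 2560*300) >> 8 = 3000, plus 4*100 = 3400 */
+    return RDCOST(rm, dm, rate, dist);
+}
+#endif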
+
+static void insertsortmv(int arr[], int len)
+{
+ int i, j, k;
+
+ for ( i = 1 ; i <= len-1 ; i++ )
+ {
+ for ( j = 0 ; j < i ; j++ )
+ {
+ if ( arr[j] > arr[i] )
+ {
+ int temp;
+
+ temp = arr[i];
+
+ for ( k = i; k >j; k--)
+ arr[k] = arr[k - 1] ;
+
+ arr[j] = temp ;
+ }
+ }
+ }
+}
+
+static void insertsortsad(int arr[],int idx[], int len)
+{
+ int i, j, k;
+
+ for ( i = 1 ; i <= len-1 ; i++ )
+ {
+ for ( j = 0 ; j < i ; j++ )
+ {
+ if ( arr[j] > arr[i] )
+ {
+ int temp, tempi;
+
+ temp = arr[i];
+ tempi = idx[i];
+
+ for ( k = i; k >j; k--)
+ {
+ arr[k] = arr[k - 1] ;
+ idx[k] = idx[k - 1];
+ }
+
+ arr[j] = temp ;
+ idx[j] = tempi;
+ }
+ }
+ }
+}
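+
+/* Illustrative usage (not part of the upstream change): both helpers are
+ * plain insertion sorts for the short candidate lists used in motion
+ * search, where O(n^2) is cheap; insertsortsad() keeps a parallel index
+ * array in step with the sorted SAD values. A minimal sketch:
+ */
+#if 0
+static void example_sort(void)
+{
+    int sad[4] = { 900, 150, 400, 150 };
+    int idx[4] = { 0, 1, 2, 3 };
+
+    insertsortsad(sad, idx, 4);
+    /* sad -> {150, 150, 400, 900}, idx -> {1, 3, 2, 0}:
+     * idx[0] now names the candidate with the lowest SAD. */
+}
+#endif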
+
+extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
+extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate);
+
+
+static void get_plane_pointers(const YV12_BUFFER_CONFIG *fb,
+ unsigned char *plane[3],
+ unsigned int recon_yoffset,
+ unsigned int recon_uvoffset)
+{
+ plane[0] = fb->y_buffer + recon_yoffset;
+ plane[1] = fb->u_buffer + recon_uvoffset;
+ plane[2] = fb->v_buffer + recon_uvoffset;
+}
+
+
+static void get_predictor_pointers(const VP8_COMP *cpi,
+ unsigned char *plane[4][3],
+ unsigned int recon_yoffset,
+ unsigned int recon_uvoffset)
+{
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx],
+ plane[LAST_FRAME], recon_yoffset, recon_uvoffset);
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ get_plane_pointers(&cpi->common.yv12_fb[cpi->common.gld_fb_idx],
+ plane[GOLDEN_FRAME], recon_yoffset, recon_uvoffset);
+
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ get_plane_pointers(&cpi->common.yv12_fb[cpi->common.alt_fb_idx],
+ plane[ALTREF_FRAME], recon_yoffset, recon_uvoffset);
+}
+
+
+static void get_reference_search_order(const VP8_COMP *cpi,
+ int ref_frame_map[4])
+{
+ int i=0;
+
+ ref_frame_map[i++] = INTRA_FRAME;
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ ref_frame_map[i++] = LAST_FRAME;
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ ref_frame_map[i++] = GOLDEN_FRAME;
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ ref_frame_map[i++] = ALTREF_FRAME;
+ for(; i<4; i++)
+ ref_frame_map[i] = -1;
+}
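+
+/* Illustrative note (not part of the upstream change): e.g. with only
+ * VP8_LAST_FRAME and VP8_ALTR_FRAME enabled, the resulting search order is
+ * { INTRA_FRAME, LAST_FRAME, ALTREF_FRAME, -1 }. */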
+
+
+extern void vp8_mv_pred
+(
+ VP8_COMP *cpi,
+ MACROBLOCKD *xd,
+ const MODE_INFO *here,
+ int_mv *mvp,
+ int refframe,
+ int *ref_frame_sign_bias,
+ int *sr,
+ int near_sadidx[]
+);
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);
+
+#endif
diff --git a/libvpx/vp8/encoder/segmentation.c b/libvpx/vp8/encoder/segmentation.c
new file mode 100644
index 0000000..37972e2
--- /dev/null
+++ b/libvpx/vp8/encoder/segmentation.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "segmentation.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x)
+{
+ int mb_row, mb_col;
+
+ MODE_INFO *this_mb_mode_info = cm->mi;
+
+ x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+ if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
+ {
+        /* Reset GF usage monitors */
+ vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
+ }
+ else
+ {
+ /* for each macroblock row in image */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ /* for each macroblock col in image */
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+
+                /* If the golden or alt-ref frame is used, set the GF active
+                 * flag if it is not already set. If the last frame with 0,0
+                 * motion is used, leave the flag as it is; otherwise (non-0,0
+                 * motion or intra modes) clear the flag if it is currently set.
+                 */
+ if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME))
+ {
+ if (*(x->gf_active_ptr) == 0)
+ {
+ *(x->gf_active_ptr) = 1;
+ cpi->gf_active_count ++;
+ }
+ }
+ else if ((this_mb_mode_info->mbmi.mode != ZEROMV) && *(x->gf_active_ptr))
+ {
+ *(x->gf_active_ptr) = 0;
+ cpi->gf_active_count--;
+ }
+
+ x->gf_active_ptr++; /* Step onto next entry */
+ this_mb_mode_info++; /* skip to next mb */
+
+ }
+
+ /* this is to account for the border */
+ this_mb_mode_info++;
+ }
+ }
+}
diff --git a/libvpx/vp8/encoder/segmentation.h b/libvpx/vp8/encoder/segmentation.h
new file mode 100644
index 0000000..12815b0
--- /dev/null
+++ b/libvpx/vp8/encoder/segmentation.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+
+extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x);
diff --git a/libvpx/vp8/encoder/ssim.c b/libvpx/vp8/encoder/ssim.c
new file mode 100644
index 0000000..e751608
--- /dev/null
+++ b/libvpx/vp8/encoder/ssim.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+
+void vp8_ssim_parms_16x16_c
+(
+ unsigned char *s,
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s,
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+)
+{
+ int i,j;
+ for(i=0;i<16;i++,s+=sp,r+=rp)
+ {
+ for(j=0;j<16;j++)
+ {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+void vp8_ssim_parms_8x8_c
+(
+ unsigned char *s,
+ int sp,
+ unsigned char *r,
+ int rp,
+ unsigned long *sum_s,
+ unsigned long *sum_r,
+ unsigned long *sum_sq_s,
+ unsigned long *sum_sq_r,
+ unsigned long *sum_sxr
+)
+{
+ int i,j;
+ for(i=0;i<8;i++,s+=sp,r+=rp)
+ {
+ for(j=0;j<8;j++)
+ {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 =  26634; // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708; // 64^2*(.03*255)^2
+
+static double similarity
+(
+ unsigned long sum_s,
+ unsigned long sum_r,
+ unsigned long sum_sq_s,
+ unsigned long sum_sq_r,
+ unsigned long sum_sxr,
+ int count
+)
+{
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+
+    // scale the constants by the number of pixels
+ c1 = (cc1*count*count)>>12;
+ c2 = (cc2*count*count)>>12;
+
+    ssim_n = (2*sum_s*sum_r + c1) * ((int64_t)2*count*sum_sxr -
+             (int64_t)2*sum_s*sum_r + c2);
+
+    ssim_d = (sum_s*sum_s + sum_r*sum_r + c1) *
+             ((int64_t)count*sum_sq_s - (int64_t)sum_s*sum_s +
+              (int64_t)count*sum_sq_r - (int64_t)sum_r*sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
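+
+/* Illustrative note (not part of the upstream change): this is the usual
+ * SSIM ratio
+ *
+ *   SSIM = ((2*u_s*u_r + C1) * (2*cov_sr + C2)) /
+ *          ((u_s^2 + u_r^2 + C1) * (var_s + var_r + C2))
+ *
+ * rewritten in terms of raw block sums. With n = count and u = sum/n,
+ * multiplying numerator and denominator through by n^2 per factor turns
+ * the mean and (co)variance terms into n*sum_sxr - sum_s*sum_r etc., so
+ * the constants must be scaled by n^2 as well; the >>12 removes the 64^2
+ * factor that is pre-multiplied into cc1 and cc2. */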
+
+static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp)
+{
+ unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+ vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp)
+{
+ unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+ vp8_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// TODO(jbb): tried to scale this function so that it may be usable as the
+// distortion metric in the mode selection code (provided we do a reconstruction).
+long dssim(unsigned char *s,int sp, unsigned char *r,int rp)
+{
+ unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+ int64_t ssim3;
+ int64_t ssim_n1,ssim_n2;
+ int64_t ssim_d1,ssim_d2;
+ int64_t ssim_t1,ssim_t2;
+ int64_t c1, c2;
+
+ // normalize by 256/64
+ c1 = cc1*16;
+ c2 = cc2*16;
+
+ vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+ ssim_n1 = (2*sum_s*sum_r+ c1);
+
+ ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
+
+ ssim_d1 =((int64_t)sum_s*sum_s +(int64_t)sum_r*sum_r+c1);
+
+ ssim_d2 = (256 * (int64_t) sum_sq_s-(int64_t) sum_s*sum_s +
+ (int64_t) 256*sum_sq_r-(int64_t) sum_r*sum_r +c2) ;
+
+ ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;
+ ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;
+
+ ssim3 = 256 *ssim_t1 * ssim_t2;
+ if(ssim3 <0 )
+ ssim3=0;
+ return (long)( ssim3 );
+}
+
+// We are using an 8x8 moving window whose starting locations step along the
+// 4x4 pixel grid. Such an arrangement allows the windows to overlap block
+// boundaries and thus penalize blocking artifacts.
+double vp8_ssim2
+(
+ unsigned char *img1,
+ unsigned char *img2,
+ int stride_img1,
+ int stride_img2,
+ int width,
+ int height
+)
+{
+ int i,j;
+ int samples =0;
+ double ssim_total=0;
+
+    // sample points start at each 4x4 location
+ for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)
+ {
+ for(j=0; j < width-8; j+=4 )
+ {
+ double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+double vp8_calc_ssim
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ int lumamask,
+ double *weight
+)
+{
+ double a, b, c;
+ double ssimv;
+
+ a = vp8_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width,
+ source->y_height);
+
+ b = vp8_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ c = vp8_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ ssimv = a * .8 + .1 * (b + c);
+
+ *weight = 1;
+
+ return ssimv;
+}
+
+double vp8_calc_ssimg
+(
+ YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ double *ssim_y,
+ double *ssim_u,
+ double *ssim_v
+)
+{
+ double ssim_all = 0;
+ double a, b, c;
+
+ a = vp8_ssim2(source->y_buffer, dest->y_buffer,
+ source->y_stride, dest->y_stride, source->y_width,
+ source->y_height);
+
+ b = vp8_ssim2(source->u_buffer, dest->u_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+
+ c = vp8_ssim2(source->v_buffer, dest->v_buffer,
+ source->uv_stride, dest->uv_stride, source->uv_width,
+ source->uv_height);
+ *ssim_y = a;
+ *ssim_u = b;
+ *ssim_v = c;
+ ssim_all = (a * 4 + b + c) /6;
+
+ return ssim_all;
+}
diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c
new file mode 100644
index 0000000..b83ae89
--- /dev/null
+++ b/libvpx/vp8/encoder/temporal_filter.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/onyxc_int.h"
+#include "onyx_int.h"
+#include "vp8/common/systemdependent.h"
+#include "quantize.h"
+#include "vp8/common/alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp8/common/extend.h"
+#include "ratectrl.h"
+#include "vp8/common/quant_common.h"
+#include "segmentation.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
+#include "vpx_ports/vpx_timer.h"
+
+#include <math.h>
+#include <limits.h>
+
+#define ALT_REF_MC_ENABLED 1 /* dis/enable MC in AltRef filtering */
+#define ALT_REF_SUBPEL_ENABLED 1 /* dis/enable subpel in MC AltRef filtering */
+
+#if VP8_TEMPORAL_ALT_REF
+
+static void vp8_temporal_filter_predictors_mb_c
+(
+ MACROBLOCKD *x,
+ unsigned char *y_mb_ptr,
+ unsigned char *u_mb_ptr,
+ unsigned char *v_mb_ptr,
+ int stride,
+ int mv_row,
+ int mv_col,
+ unsigned char *pred
+)
+{
+ int offset;
+ unsigned char *yptr, *uptr, *vptr;
+
+ /* Y */
+ yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict16x16(yptr, stride,
+ mv_col & 7, mv_row & 7, &pred[0], 16);
+ }
+ else
+ {
+ vp8_copy_mem16x16(yptr, stride, &pred[0], 16);
+ }
+
+ /* U & V */
+ mv_row >>= 1;
+ mv_col >>= 1;
+ stride = (stride + 1) >> 1;
+ offset = (mv_row >> 3) * stride + (mv_col >> 3);
+ uptr = u_mb_ptr + offset;
+ vptr = v_mb_ptr + offset;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ x->subpixel_predict8x8(uptr, stride,
+ mv_col & 7, mv_row & 7, &pred[256], 8);
+ x->subpixel_predict8x8(vptr, stride,
+ mv_col & 7, mv_row & 7, &pred[320], 8);
+ }
+ else
+ {
+ vp8_copy_mem8x8(uptr, stride, &pred[256], 8);
+ vp8_copy_mem8x8(vptr, stride, &pred[320], 8);
+ }
+}
+void vp8_temporal_filter_apply_c
+(
+ unsigned char *frame1,
+ unsigned int stride,
+ unsigned char *frame2,
+ unsigned int block_size,
+ int strength,
+ int filter_weight,
+ unsigned int *accumulator,
+ unsigned short *count
+)
+{
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+
+ for (i = 0,k = 0; i < block_size; i++)
+ {
+ for (j = 0; j < block_size; j++, k++)
+ {
+
+ int src_byte = frame1[byte];
+ int pixel_value = *frame2++;
+
+ modifier = src_byte - pixel_value;
+ /* This is an integer approximation of:
+             * float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+ * modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
+ */
+ modifier *= modifier;
+ modifier *= 3;
+ modifier += 1 << (strength - 1);
+ modifier >>= strength;
+
+ if (modifier > 16)
+ modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_size;
+ }
+}
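+
+/* Illustrative sketch (not part of the upstream change): the rounded
+ * integer weight above falls off quadratically with the pixel difference.
+ * For strength 6, a difference of 0 keeps the full weight of 16, a
+ * difference of 10 maps to 16 - ((3*100 + 32) >> 6) = 11, and differences
+ * of 19 or more are clamped to 0. A scalar check:
+ */
+#if 0
+static int example_weight(int diff, int strength)
+{
+    int m = diff * diff * 3;
+    m += 1 << (strength - 1);   /* round to nearest */
+    m >>= strength;
+    if (m > 16)
+        m = 16;
+    return 16 - m;              /* 0..16, larger for closer pixels */
+}
+#endif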
+
+#if ALT_REF_MC_ENABLED
+
+static int vp8_temporal_filter_find_matching_mb_c
+(
+ VP8_COMP *cpi,
+ YV12_BUFFER_CONFIG *arf_frame,
+ YV12_BUFFER_CONFIG *frame_ptr,
+ int mb_offset,
+ int error_thresh
+)
+{
+ MACROBLOCK *x = &cpi->mb;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+
+ BLOCK *b = &x->block[0];
+ BLOCKD *d = &x->e_mbd.block[0];
+ int_mv best_ref_mv1;
+ int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ /* Save input state */
+ unsigned char **base_src = b->base_src;
+ int src = b->src;
+ int src_stride = b->src_stride;
+ unsigned char *base_pre = x->e_mbd.pre.y_buffer;
+ int pre = d->offset;
+ int pre_stride = x->e_mbd.pre.y_stride;
+
+ best_ref_mv1.as_int = 0;
+ best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
+ best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
+
+ /* Setup frame pointers */
+ b->base_src = &arf_frame->y_buffer;
+ b->src_stride = arf_frame->y_stride;
+ b->src = mb_offset;
+
+ x->e_mbd.pre.y_buffer = frame_ptr->y_buffer;
+ x->e_mbd.pre.y_stride = frame_ptr->y_stride;
+ d->offset = mb_offset;
+
+ /* Further step/diamond searches as necessary */
+ if (cpi->Speed < 8)
+ {
+ step_param = cpi->sf.first_step + (cpi->Speed > 5);
+ }
+ else
+ {
+ step_param = cpi->sf.first_step + 2;
+ }
+
+ /* TODO Check that the 16x16 vf & sdf are selected here */
+ /* Ignore mv costing by sending NULL cost arrays */
+ bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv,
+ step_param, sadpb,
+ &cpi->fn_ptr[BLOCK_16X16],
+ NULL, NULL, &best_ref_mv1);
+
+#if ALT_REF_SUBPEL_ENABLED
+ /* Try sub-pixel MC? */
+ {
+ int distortion;
+ unsigned int sse;
+ /* Ignore mv costing by sending NULL cost array */
+ bestsme = cpi->find_fractional_mv_step(x, b, d,
+ &d->bmi.mv,
+ &best_ref_mv1,
+ x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16],
+ NULL, &distortion, &sse);
+ }
+#endif
+
+ /* Save input state */
+ b->base_src = base_src;
+ b->src = src;
+ b->src_stride = src_stride;
+ x->e_mbd.pre.y_buffer = base_pre;
+ d->offset = pre;
+ x->e_mbd.pre.y_stride = pre_stride;
+
+ return bestsme;
+}
+#endif
+
+static void vp8_temporal_filter_iterate_c
+(
+ VP8_COMP *cpi,
+ int frame_count,
+ int alt_ref_index,
+ int strength
+)
+{
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ int mb_cols = cpi->common.mb_cols;
+ int mb_rows = cpi->common.mb_rows;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
+ MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
+ unsigned char *dst1, *dst2;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
+
+ /* Save input state */
+ unsigned char *y_buffer = mbd->pre.y_buffer;
+ unsigned char *u_buffer = mbd->pre.u_buffer;
+ unsigned char *v_buffer = mbd->pre.v_buffer;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+#if ALT_REF_MC_ENABLED
+        /* Source frames are extended to 16 pixels. This is different from
+         * L/A/G reference frames, which have a border of 32
+         * (VP8BORDERINPIXELS). A 6 tap filter is used for motion search.
+         * This requires 2 pixels
+ * before and 3 pixels after. So the largest Y mv on a border would
+ * then be 16 - 3. The UV blocks are half the size of the Y and
+ * therefore only extended by 8. The largest mv that a UV block
+ * can support is 8 - 3. A UV mv is half of a Y mv.
+ * (16 - 3) >> 1 == 6 which is greater than 8 - 3.
+ * To keep the mv in play for both Y and UV planes the max that it
+ * can be on a border is therefore 16 - 5.
+ */
+ cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5));
+ cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+ + (16 - 5);
+#endif
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int i, j, k;
+ int stride;
+
+ vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
+ vpx_memset(count, 0, 384*sizeof(unsigned short));
+
+#if ALT_REF_MC_ENABLED
+ cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
+ cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+ + (16 - 5);
+#endif
+
+ for (frame = 0; frame < frame_count; frame++)
+ {
+ if (cpi->frames[frame] == NULL)
+ continue;
+
+ mbd->block[0].bmi.mv.as_mv.row = 0;
+ mbd->block[0].bmi.mv.as_mv.col = 0;
+
+ if (frame == alt_ref_index)
+ {
+ filter_weight = 2;
+ }
+ else
+ {
+ int err = 0;
+#if ALT_REF_MC_ENABLED
+#define THRESH_LOW 10000
+#define THRESH_HIGH 20000
+ /* Find best match in this frame by MC */
+ err = vp8_temporal_filter_find_matching_mb_c
+ (cpi,
+ cpi->frames[alt_ref_index],
+ cpi->frames[frame],
+ mb_y_offset,
+ THRESH_LOW);
+#endif
+                    /* Assign a higher weight to the matching MB if its
+                     * error score is lower. If MC is not applied, the
+                     * default behavior is to weight all MBs equally.
+                     */
+ filter_weight = err<THRESH_LOW
+ ? 2 : err<THRESH_HIGH ? 1 : 0;
+ }
+
+ if (filter_weight != 0)
+ {
+ /* Construct the predictors */
+ vp8_temporal_filter_predictors_mb_c
+ (mbd,
+ cpi->frames[frame]->y_buffer + mb_y_offset,
+ cpi->frames[frame]->u_buffer + mb_uv_offset,
+ cpi->frames[frame]->v_buffer + mb_uv_offset,
+ cpi->frames[frame]->y_stride,
+ mbd->block[0].bmi.mv.as_mv.row,
+ mbd->block[0].bmi.mv.as_mv.col,
+ predictor);
+
+ /* Apply the filter (YUV) */
+ vp8_temporal_filter_apply
+ (f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor,
+ 16,
+ strength,
+ filter_weight,
+ accumulator,
+ count);
+
+ vp8_temporal_filter_apply
+ (f->u_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 256,
+ 8,
+ strength,
+ filter_weight,
+ accumulator + 256,
+ count + 256);
+
+ vp8_temporal_filter_apply
+ (f->v_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 320,
+ 8,
+ strength,
+ filter_weight,
+ accumulator + 320,
+ count + 320);
+ }
+ }
+
+ /* Normalize filter output to produce AltRef frame */
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0,k = 0; i < 16; i++)
+ {
+ for (j = 0; j < 16; j++, k++)
+ {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (unsigned char)pval;
+
+ /* move to next pixel */
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0,k = 256; i < 8; i++)
+ {
+ for (j = 0; j < 8; j++, k++)
+ {
+ int m=k+64;
+
+ /* U */
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= cpi->fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (unsigned char)pval;
+
+ /* V */
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= cpi->fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (unsigned char)pval;
+
+ /* move to next pixel */
+ byte++;
+ }
+
+ byte += stride - 8;
+ }
+
+ mb_y_offset += 16;
+ mb_uv_offset += 8;
+ }
+
+ mb_y_offset += 16*(f->y_stride-mb_cols);
+ mb_uv_offset += 8*(f->uv_stride-mb_cols);
+ }
+
+ /* Restore input state */
+ mbd->pre.y_buffer = y_buffer;
+ mbd->pre.u_buffer = u_buffer;
+ mbd->pre.v_buffer = v_buffer;
+}
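+
+/* Illustrative note (not part of the upstream change): the normalization
+ * above is a rounded division done with a reciprocal table. Assuming
+ * fixed_divide[i] holds (1 << 19) / i, which is consistent with the
+ * >> 19 above, each output pixel is accumulator/count rounded to nearest:
+ */
+#if 0
+static unsigned char example_normalize(unsigned int acc, unsigned short cnt,
+                                       const unsigned int *fixed_divide)
+{
+    unsigned int pval = acc + (cnt >> 1);   /* add half for rounding   */
+    pval *= fixed_divide[cnt];              /* multiply by ~(2^19)/cnt */
+    pval >>= 19;                            /* undo the 2^19 scale     */
+    return (unsigned char)pval;
+}
+#endif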
+
+void vp8_temporal_filter_prepare_c
+(
+ VP8_COMP *cpi,
+ int distance
+)
+{
+ int frame = 0;
+
+ int num_frames_backward = 0;
+ int num_frames_forward = 0;
+ int frames_to_blur_backward = 0;
+ int frames_to_blur_forward = 0;
+ int frames_to_blur = 0;
+ int start_frame = 0;
+
+ int strength = cpi->oxcf.arnr_strength;
+
+ int blur_type = cpi->oxcf.arnr_type;
+
+ int max_frames = cpi->active_arnr_frames;
+
+ num_frames_backward = distance;
+ num_frames_forward = vp8_lookahead_depth(cpi->lookahead)
+ - (num_frames_backward + 1);
+
+ switch (blur_type)
+ {
+ case 1:
+ /* Backward Blur */
+
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_backward >= max_frames)
+ frames_to_blur_backward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_backward + 1;
+ break;
+
+ case 2:
+ /* Forward Blur */
+
+ frames_to_blur_forward = num_frames_forward;
+
+ if (frames_to_blur_forward >= max_frames)
+ frames_to_blur_forward = max_frames - 1;
+
+ frames_to_blur = frames_to_blur_forward + 1;
+ break;
+
+ case 3:
+ default:
+ /* Center Blur */
+ frames_to_blur_forward = num_frames_forward;
+ frames_to_blur_backward = num_frames_backward;
+
+ if (frames_to_blur_forward > frames_to_blur_backward)
+ frames_to_blur_forward = frames_to_blur_backward;
+
+ if (frames_to_blur_backward > frames_to_blur_forward)
+ frames_to_blur_backward = frames_to_blur_forward;
+
+ /* When max_frames is even we have 1 more frame backward than forward */
+ if (frames_to_blur_forward > (max_frames - 1) / 2)
+ frames_to_blur_forward = ((max_frames - 1) / 2);
+
+ if (frames_to_blur_backward > (max_frames / 2))
+ frames_to_blur_backward = (max_frames / 2);
+
+ frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+ break;
+ }
+
+ start_frame = distance + frames_to_blur_forward;
+
+ /* Setup frame pointers, NULL indicates frame not included in filter */
+ vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+ for (frame = 0; frame < frames_to_blur; frame++)
+ {
+ int which_buffer = start_frame - frame;
+ struct lookahead_entry* buf = vp8_lookahead_peek(cpi->lookahead,
+ which_buffer,
+ PEEK_FORWARD);
+ cpi->frames[frames_to_blur-1-frame] = &buf->img;
+ }
+
+ vp8_temporal_filter_iterate_c (
+ cpi,
+ frames_to_blur,
+ frames_to_blur_backward,
+ strength );
+}
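+
+/* Illustrative note (not part of the upstream change): for the default
+ * centered blur (blur_type 3) the window is balanced around the ARF
+ * position and trimmed to max_frames. E.g. with distance 3, a lookahead
+ * depth of 10 and max_frames 7: num_frames_backward = 3,
+ * num_frames_forward = 6, both sides are clipped to min(3, 6) = 3, so
+ * frames_to_blur = 3 + 3 + 1 = 7 and the window starts at lookahead
+ * position distance + 3 = 6, walking backwards. */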
+#endif
diff --git a/libvpx/vp8/encoder/tokenize.c b/libvpx/vp8/encoder/tokenize.c
new file mode 100644
index 0000000..3b5268b
--- /dev/null
+++ b/libvpx/vp8/encoder/tokenize.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, then generating context.c (the initial stats). */
+
+#ifdef ENTROPY_STATS
+_int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;
+void vp8_fix_contexts(MACROBLOCKD *x);
+
+#include "dct_value_tokens.h"
+#include "dct_value_cost.h"
+
+const TOKENVALUE *const vp8_dct_value_tokens_ptr = dct_value_tokens +
+ DCT_MAX_VALUE;
+const short *const vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+
+#if 0
+int skip_true_count = 0;
+int skip_false_count = 0;
+#endif
+
+/* function used to generate dct_value_tokens and dct_value_cost tables */
+/*
+static void fill_value_tokens()
+{
+
+ TOKENVALUE *t = dct_value_tokens + DCT_MAX_VALUE;
+ const vp8_extra_bit_struct *e = vp8_extra_bits;
+
+ int i = -DCT_MAX_VALUE;
+ int sign = 1;
+
+ do
+ {
+ if (!i)
+ sign = 0;
+
+ {
+ const int a = sign ? -i : i;
+ int eb = sign;
+
+ if (a > 4)
+ {
+ int j = 4;
+
+ while (++j < 11 && e[j].base_val <= a) {}
+
+ t[i].Token = --j;
+ eb |= (a - e[j].base_val) << 1;
+ }
+ else
+ t[i].Token = a;
+
+ t[i].Extra = eb;
+ }
+
+    // initialize the cost for extra bits for all possible coefficient values.
+ {
+ int cost = 0;
+ const vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token;
+
+ if (p->base_val)
+ {
+ const int extra = t[i].Extra;
+ const int Length = p->Len;
+
+ if (Length)
+ cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+ cost += vp8_cost_bit(vp8_prob_half, extra & 1); // sign
+ dct_value_cost[i + DCT_MAX_VALUE] = cost;
+ }
+
+ }
+
+ }
+ while (++i < DCT_MAX_VALUE);
+
+ vp8_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+ vp8_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+}
+*/
+
+static void tokenize2nd_order_b
+(
+ MACROBLOCK *x,
+ TOKENEXTRA **tp,
+ VP8_COMP *cpi
+)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ int pt; /* near block/prev token context index */
+ int c; /* start at DC */
+ TOKENEXTRA *t = *tp;/* store tokens starting here */
+ const BLOCKD *b;
+ const short *qcoeff_ptr;
+ ENTROPY_CONTEXT * a;
+ ENTROPY_CONTEXT * l;
+ int band, rc, v, token;
+ int eob;
+
+ b = xd->block + 24;
+ qcoeff_ptr = b->qcoeff;
+ a = (ENTROPY_CONTEXT *)xd->above_context + 8;
+ l = (ENTROPY_CONTEXT *)xd->left_context + 8;
+ eob = xd->eobs[24];
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ if(!eob)
+ {
+ /* c = band for this case */
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN];
+ t++;
+ *tp = t;
+ *a = *l = 0;
+ return;
+ }
+
+ v = qcoeff_ptr[0];
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+ t->Token = token;
+
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [1] [0] [pt] [token];
+ pt = vp8_prev_token_class[token];
+ t++;
+ c = 1;
+
+ for (; c < eob; c++)
+ {
+ rc = vp8_default_zig_zag1d[c];
+ band = vp8_coef_bands[c];
+ v = qcoeff_ptr[rc];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+
+ t->Token = token;
+ t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+        t->skip_eob_node = (pt == 0);
+
+ ++x->coef_counts [1] [band] [pt] [token];
+
+ pt = vp8_prev_token_class[token];
+ t++;
+ }
+ if (c < 16)
+ {
+ band = vp8_coef_bands[c];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [band] [pt];
+
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [1] [band] [pt] [DCT_EOB_TOKEN];
+
+ t++;
+ }
+
+ *tp = t;
+ *a = *l = 1;
+
+}
+
+static void tokenize1st_order_b
+(
+ MACROBLOCK *x,
+ TOKENEXTRA **tp,
+ int type, /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+ VP8_COMP *cpi
+)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int block;
+ const BLOCKD *b;
+ int pt; /* near block/prev token context index */
+ int c;
+ int token;
+ TOKENEXTRA *t = *tp;/* store tokens starting here */
+ const short *qcoeff_ptr;
+ ENTROPY_CONTEXT * a;
+ ENTROPY_CONTEXT * l;
+ int band, rc, v;
+ int tmp1, tmp2;
+
+ b = xd->block;
+ /* Luma */
+ for (block = 0; block < 16; block++, b++)
+ {
+ tmp1 = vp8_block2above[block];
+ tmp2 = vp8_block2left[block];
+ qcoeff_ptr = b->qcoeff;
+ a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+ l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ c = type ? 0 : 1;
+
+ if(c >= *b->eob)
+ {
+ /* c = band for this case */
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [type] [c] [pt] [DCT_EOB_TOKEN];
+ t++;
+ *tp = t;
+ *a = *l = 0;
+ continue;
+ }
+
+ v = qcoeff_ptr[c];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+ t->Token = token;
+
+ t->context_tree = cpi->common.fc.coef_probs [type] [c] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [type] [c] [pt] [token];
+ pt = vp8_prev_token_class[token];
+ t++;
+ c++;
+
+ for (; c < *b->eob; c++)
+ {
+ rc = vp8_default_zig_zag1d[c];
+ band = vp8_coef_bands[c];
+ v = qcoeff_ptr[rc];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+
+ t->Token = token;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+ t->skip_eob_node = (pt == 0);
+ ++x->coef_counts [type] [band] [pt] [token];
+
+ pt = vp8_prev_token_class[token];
+ t++;
+ }
+ if (c < 16)
+ {
+ band = vp8_coef_bands[c];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+ t->skip_eob_node = 0;
+ ++x->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN];
+
+ t++;
+ }
+ *tp = t;
+ *a = *l = 1;
+ }
+
+ /* Chroma */
+ for (block = 16; block < 24; block++, b++)
+ {
+ tmp1 = vp8_block2above[block];
+ tmp2 = vp8_block2left[block];
+ qcoeff_ptr = b->qcoeff;
+ a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
+ l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ if(!(*b->eob))
+ {
+ /* c = band for this case */
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [2] [0] [pt] [DCT_EOB_TOKEN];
+ t++;
+ *tp = t;
+ *a = *l = 0;
+ continue;
+ }
+
+ v = qcoeff_ptr[0];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+ t->Token = token;
+
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [2] [0] [pt] [token];
+ pt = vp8_prev_token_class[token];
+ t++;
+ c = 1;
+
+ for (; c < *b->eob; c++)
+ {
+ rc = vp8_default_zig_zag1d[c];
+ band = vp8_coef_bands[c];
+ v = qcoeff_ptr[rc];
+
+ t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+ token = vp8_dct_value_tokens_ptr[v].Token;
+
+ t->Token = token;
+ t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+ t->skip_eob_node = (pt == 0);
+
+ ++x->coef_counts [2] [band] [pt] [token];
+
+ pt = vp8_prev_token_class[token];
+ t++;
+ }
+ if (c < 16)
+ {
+ band = vp8_coef_bands[c];
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [band] [pt];
+
+ t->skip_eob_node = 0;
+
+ ++x->coef_counts [2] [band] [pt] [DCT_EOB_TOKEN];
+
+ t++;
+ }
+ *tp = t;
+ *a = *l = 1;
+ }
+}
+
+
+static int mb_is_skippable(MACROBLOCKD *x, int has_y2_block)
+{
+ int skip = 1;
+ int i = 0;
+
+ if (has_y2_block)
+ {
+ for (i = 0; i < 16; i++)
+ skip &= (x->eobs[i] < 2);
+ }
+
+ for (; i < 24 + has_y2_block; i++)
+ skip &= (!x->eobs[i]);
+
+ return skip;
+}
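+
+/* Illustrative note (not part of the upstream change): when a Y2 block is
+ * present the luma DC coefficients are coded in the Y2 block instead, so a
+ * luma block with eob < 2 (at most the uncoded DC slot) emits no tokens;
+ * the chroma blocks and the Y2 block itself must then be entirely empty
+ * for the macroblock to be skippable. */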
+
+
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ int plane_type;
+ int has_y2_block;
+
+ has_y2_block = (xd->mode_info_context->mbmi.mode != B_PRED
+ && xd->mode_info_context->mbmi.mode != SPLITMV);
+
+ xd->mode_info_context->mbmi.mb_skip_coeff =
+ mb_is_skippable(xd, has_y2_block);
+ if (xd->mode_info_context->mbmi.mb_skip_coeff)
+ {
+ if (!cpi->common.mb_no_coeff_skip)
+ {
+ vp8_stuff_mb(cpi, x, t);
+ }
+ else
+ {
+ vp8_fix_contexts(xd);
+ x->skip_true_count++;
+ }
+
+ return;
+ }
+
+ plane_type = 3;
+ if(has_y2_block)
+ {
+ tokenize2nd_order_b(x, t, cpi);
+ plane_type = 0;
+ }
+
+ tokenize1st_order_b(x, t, plane_type, cpi);
+}
+
+
+#ifdef ENTROPY_STATS
+
+void init_context_counters(void)
+{
+ vpx_memset(context_counters, 0, sizeof(context_counters));
+}
+
+void print_context_counters()
+{
+
+ int type, band, pt, t;
+
+ FILE *const f = fopen("context.c", "w");
+
+ fprintf(f, "#include \"entropy.h\"\n");
+
+ fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+
+ fprintf(f, "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];\n\n");
+
+ fprintf(f, "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+
+# define Comma( X) (X? ",":"")
+
+ type = 0;
+
+ do
+ {
+ fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
+
+ band = 0;
+
+ do
+ {
+ fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
+
+ pt = 0;
+
+ do
+ {
+ fprintf(f, "%s\n {", Comma(pt));
+
+ t = 0;
+
+ do
+ {
+ const _int64 x = context_counters [type] [band] [pt] [t];
+ const int y = (int) x;
+
+ assert(x == (_int64) y); /* no overflow handling yet */
+ fprintf(f, "%s %d", Comma(t), y);
+
+ }
+ while (++t < MAX_ENTROPY_TOKENS);
+
+ fprintf(f, "}");
+ }
+ while (++pt < PREV_COEF_CONTEXTS);
+
+ fprintf(f, "\n }");
+
+ }
+ while (++band < COEF_BANDS);
+
+ fprintf(f, "\n }");
+ }
+ while (++type < BLOCK_TYPES);
+
+ fprintf(f, "\n};\n");
+ fclose(f);
+}
+#endif
+
+
+static void stuff2nd_order_b
+(
+ TOKENEXTRA **tp,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi,
+ MACROBLOCK *x
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [1] [0] [pt] [DCT_EOB_TOKEN];
+ ++t;
+
+ *tp = t;
+ pt = 0;
+ *a = *l = pt;
+}
+
+static void stuff1st_order_b
+(
+ TOKENEXTRA **tp,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ int type,
+ VP8_COMP *cpi,
+ MACROBLOCK *x
+)
+{
+ int pt; /* near block/prev token context index */
+ int band;
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+ band = type ? 0 : 1;
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts [type] [band] [pt] [DCT_EOB_TOKEN];
+ ++t;
+ *tp = t;
+ pt = 0; /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+}
+
+static void stuff1st_order_buv
+(
+ TOKENEXTRA **tp,
+ ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l,
+ VP8_COMP *cpi,
+ MACROBLOCK *x
+)
+{
+ int pt; /* near block/prev token context index */
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+ t->Token = DCT_EOB_TOKEN;
+ t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+ t->skip_eob_node = 0;
+ ++x->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+ ++t;
+ *tp = t;
+ pt = 0; /* 0 <-> all coeff data is zero */
+ *a = *l = pt;
+}
+
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+ MACROBLOCKD *xd = &x->e_mbd;
+ ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
+ ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
+ int plane_type;
+ int b;
+ plane_type = 3;
+ if((xd->mode_info_context->mbmi.mode != B_PRED
+ && xd->mode_info_context->mbmi.mode != SPLITMV))
+ {
+ stuff2nd_order_b(t,
+ A + vp8_block2above[24], L + vp8_block2left[24], cpi, x);
+ plane_type = 0;
+ }
+
+ for (b = 0; b < 16; b++)
+ stuff1st_order_b(t,
+ A + vp8_block2above[b],
+ L + vp8_block2left[b], plane_type, cpi, x);
+
+ for (b = 16; b < 24; b++)
+ stuff1st_order_buv(t,
+ A + vp8_block2above[b],
+ L + vp8_block2left[b], cpi, x);
+
+}
+void vp8_fix_contexts(MACROBLOCKD *x)
+{
+ /* Clear entropy contexts for Y2 blocks */
+ if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ }
+ else
+ {
+ vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ }
+
+}
diff --git a/libvpx/vp8/encoder/tokenize.h b/libvpx/vp8/encoder/tokenize.h
new file mode 100644
index 0000000..c2d1438
--- /dev/null
+++ b/libvpx/vp8/encoder/tokenize.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "vp8/common/entropy.h"
+#include "block.h"
+
+void vp8_tokenize_initialize();
+
+typedef struct
+{
+ short Token;
+ short Extra;
+} TOKENVALUE;
+
+typedef struct
+{
+ const vp8_prob *context_tree;
+ short Extra;
+ unsigned char Token;
+ unsigned char skip_eob_node;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+#ifdef ENTROPY_STATS
+void init_context_counters();
+void print_context_counters();
+
+extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+
+extern const short *const vp8_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ * improve cache locality, since it's needed for costing when the rest of the
+ * fields are not.
+ */
+extern const TOKENVALUE *const vp8_dct_value_tokens_ptr;
+
+#endif /* tokenize_h */
diff --git a/libvpx/vp8/encoder/treewriter.c b/libvpx/vp8/encoder/treewriter.c
new file mode 100644
index 0000000..ef25f67
--- /dev/null
+++ b/libvpx/vp8/encoder/treewriter.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
+static void cost(
+ int *const C,
+ vp8_tree T,
+ const vp8_prob *const P,
+ int i,
+ int c
+)
+{
+ const vp8_prob p = P [i>>1];
+
+ do
+ {
+ const vp8_tree_index j = T[i];
+ const int d = c + vp8_cost_bit(p, i & 1);
+
+ if (j <= 0)
+ C[-j] = d;
+ else
+ cost(C, T, P, j, d);
+ }
+ while (++i & 1);
+}
+void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t)
+{
+ cost(c, t, p, 0, 0);
+}
+void vp8_cost_tokens2(int *c, const vp8_prob *p, vp8_tree t,int start)
+{
+ cost(c, t, p, start, 0);
+}
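+
+/* Illustrative sketch (not part of the upstream change): trees store leaves
+ * as non-positive indices (-token) and interior nodes as positive offsets
+ * into the same array. For a hypothetical two-symbol alphabet the recursion
+ * above reduces to one probability lookup per leaf:
+ */
+#if 0
+static const vp8_tree_index example_tree[2] = { 0, -1 };  /* leaves 0 and 1 */
+
+static void example_costs(int costs[2], const vp8_prob prob[1])
+{
+    vp8_cost_tokens(costs, prob, example_tree);
+    /* costs[0] == vp8_cost_zero(prob[0]) (the "0" branch) and
+     * costs[1] == vp8_cost_one(prob[0])  (the "1" branch),
+     * both in 1/256-bit units. */
+}
+#endif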
diff --git a/libvpx/vp8/encoder/treewriter.h b/libvpx/vp8/encoder/treewriter.h
new file mode 100644
index 0000000..48574f3
--- /dev/null
+++ b/libvpx/vp8/encoder/treewriter.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
+   bit coder. Timothy S Murphy  11 October 2004 */
+
+#include "vp8/common/treecoder.h"
+
+#include "boolhuff.h" /* for now */
+
+typedef BOOL_CODER vp8_writer;
+
+#define vp8_write vp8_encode_bool
+#define vp8_write_literal vp8_encode_value
+#define vp8_write_bit( W, V) vp8_write( W, V, vp8_prob_half)
+
+#define vp8bc_write vp8bc_write_bool
+#define vp8bc_write_literal vp8bc_write_bits
+#define vp8bc_write_bit( W, V) vp8bc_write_bits( W, V, 1)
+
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp8_cost_zero( x) ( vp8_prob_cost[x])
+#define vp8_cost_one( x) vp8_cost_zero( vp8_complement(x))
+
+#define vp8_cost_bit( x, b) vp8_cost_zero( (b)? vp8_complement(x) : (x) )
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* Both of these return bits, not scaled bits. */
+
+static unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+{
+ /* Imitate existing calculation */
+
+ return ((ct[0] * vp8_cost_zero(p))
+ + (ct[1] * vp8_cost_one(p))) >> 8;
+}
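+
+/* Illustrative note (not part of the upstream change): these costs are
+ * roughly -256*log2(prob/256), so a probability of 128 (an even coin)
+ * prices either branch at 256, i.e. exactly one bit; the final >> 8 in
+ * vp8_cost_branch() is what converts the scaled total back to whole bits. */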
+
+/* Small functions to write explicit values and tokens, as well as
+ estimate their lengths. */
+
+static void vp8_treed_write
+(
+ vp8_writer *const w,
+ vp8_tree t,
+ const vp8_prob *const p,
+ int v,
+ int n /* number of bits in v, assumed nonzero */
+)
+{
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (v >> --n) & 1;
+ vp8_write(w, b, p[i>>1]);
+ i = t[i+b];
+ }
+ while (n);
+}
+static void vp8_write_token
+(
+ vp8_writer *const w,
+ vp8_tree t,
+ const vp8_prob *const p,
+ vp8_token *const x
+)
+{
+ vp8_treed_write(w, t, p, x->value, x->Len);
+}
+
+static int vp8_treed_cost(
+ vp8_tree t,
+ const vp8_prob *const p,
+ int v,
+ int n /* number of bits in v, assumed nonzero */
+)
+{
+ int c = 0;
+ vp8_tree_index i = 0;
+
+ do
+ {
+ const int b = (v >> --n) & 1;
+ c += vp8_cost_bit(p[i>>1], b);
+ i = t[i+b];
+ }
+ while (n);
+
+ return c;
+}
+static int vp8_cost_token
+(
+ vp8_tree t,
+ const vp8_prob *const p,
+ vp8_token *const x
+)
+{
+ return vp8_treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp8_cost_tokens(
+ int *Costs, const vp8_prob *, vp8_tree
+);
+
+void vp8_cost_tokens2(
+ int *Costs, const vp8_prob *, vp8_tree, int
+);
+
+#endif
diff --git a/libvpx/vp8/encoder/x86/dct_mmx.asm b/libvpx/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 0000000..6f188cb
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,241 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx) PRIVATE
+sym(vp8_short_fdct4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+
+ movsxd rax, dword ptr arg(2) ;pitch
+
+ lea rcx, [rsi + rax*2]
+ ; read the input data
+ movq mm0, [rsi]
+ movq mm1, [rsi + rax]
+
+ movq mm2, [rcx]
+ movq mm4, [rcx + rax]
+
+ ; transpose for the first stage
+ movq mm3, mm0 ; 00 01 02 03
+ movq mm5, mm2 ; 20 21 22 23
+
+ punpcklwd mm0, mm1 ; 00 10 01 11
+ punpckhwd mm3, mm1 ; 02 12 03 13
+
+ punpcklwd mm2, mm4 ; 20 30 21 31
+ punpckhwd mm5, mm4 ; 22 32 23 33
+
+ movq mm1, mm0 ; 00 10 01 11
+ punpckldq mm0, mm2 ; 00 10 20 30
+
+ punpckhdq mm1, mm2 ; 01 11 21 31
+
+ movq mm2, mm3 ; 02 12 03 13
+ punpckldq mm2, mm5 ; 02 12 22 32
+
+ punpckhdq mm3, mm5 ; 03 13 23 33
+
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+ ; mm3 3
+
+ ; first stage
+ movq mm5, mm0
+ movq mm4, mm1
+
+ paddw mm0, mm3 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
+
+ psubw mm4, mm2 ; c1 = 1 - 2
+ psubw mm5, mm3 ; d1 = 0 - 3
+
+ psllw mm5, 3
+ psllw mm4, 3
+
+ psllw mm0, 3
+ psllw mm1, 3
+
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
+
+ paddw mm0, mm1 ; op[0] = a1 + b1
+ psubw mm2, mm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm4 ; c1 d1
+ punpckhwd mm5, mm4 ; c1 d1
+
+ movq mm3, mm1
+ movq mm4, mm5
+
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd mm1, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm4, MMWORD PTR[GLOBAL(_14500)]
+ paddd mm3, MMWORD PTR[GLOBAL(_7500)]
+ paddd mm5, MMWORD PTR[GLOBAL(_7500)]
+
+ psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw mm1, mm4 ; op[1]
+ packssdw mm3, mm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movq mm4, mm0 ; 00 10 20 30
+ movq mm5, mm2 ; 02 12 22 32
+
+ punpcklwd mm0, mm1 ; 00 01 10 11
+ punpckhwd mm4, mm1 ; 20 21 30 31
+
+ punpcklwd mm2, mm3 ; 02 03 12 13
+ punpckhwd mm5, mm3 ; 22 23 32 33
+
+ movq mm1, mm0 ; 00 01 10 11
+ punpckldq mm0, mm2 ; 00 01 02 03
+
+        punpckhdq   mm1, mm2                ; 10 11 12 13
+
+        movq        mm2, mm4                ; 20 21 30 31
+ punpckldq mm2, mm5 ; 20 21 22 23
+
+ punpckhdq mm4, mm5 ; 30 31 32 33
+
+ ; mm0 0
+ ; mm1 1
+ ; mm2 2
+        ; mm4 3
+
+ movq mm5, mm0
+ movq mm3, mm1
+
+ paddw mm0, mm4 ; a1 = 0 + 3
+ paddw mm1, mm2 ; b1 = 1 + 2
+
+ psubw mm3, mm2 ; c1 = 1 - 2
+ psubw mm5, mm4 ; d1 = 0 - 3
+
+ pxor mm6, mm6 ; zero out for compare
+
+ pcmpeqw mm6, mm5 ; d1 != 0
+
+ pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
+ ; and keep bit 0 of lower
+
+ ; output 0 and 2
+ movq mm2, mm0 ; a1
+
+ paddw mm0, mm1 ; a1 + b1
+ psubw mm2, mm1 ; a1 - b1
+
+ paddw mm0, MMWORD PTR[GLOBAL(_7w)]
+ paddw mm2, MMWORD PTR[GLOBAL(_7w)]
+
+ psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ movq MMWORD PTR[rdi + 0 ], mm0
+ movq MMWORD PTR[rdi + 16], mm2
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movq mm1, mm5 ; d1
+ punpcklwd mm1, mm3 ; c1 d1
+ punpckhwd mm5, mm3 ; c1 d1
+
+ movq mm3, mm1
+ movq mm4, mm5
+
+ pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd mm1, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm4, MMWORD PTR[GLOBAL(_12000)]
+ paddd mm3, MMWORD PTR[GLOBAL(_51000)]
+ paddd mm5, MMWORD PTR[GLOBAL(_51000)]
+
+        psrad       mm1, 16                 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+        psrad       mm4, 16                 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+        psrad       mm3, 16                 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+        psrad       mm5, 16                 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw mm1, mm4 ; op[4]
+ packssdw mm3, mm5 ; op[12]
+
+ paddw mm1, mm6 ; op[4] += (d1!=0)
+
+ movq MMWORD PTR[rdi + 8 ], mm1
+ movq MMWORD PTR[rdi + 24], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 8
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 8
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 8
+_cmp_mask:
+ times 4 dw 1
+align 8
+_7w:
+ times 4 dw 7
+align 8
+_14500:
+ times 2 dd 14500
+align 8
+_7500:
+ times 2 dd 7500
+align 8
+_12000:
+ times 2 dd 12000
+align 8
+_51000:
+ times 2 dd 51000
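For readers following the MMX arithmetic above, here is a scalar C sketch of
the same 4x4 forward DCT, reconstructed from the constants in the assembly
(the 5352/2217 rotation, the 14500/7500 first-pass rounding, and the
12000/51000 constants plus the d1 != 0 correction in the second pass). Treat
it as an illustration of what the SIMD code computes, not as the canonical C
reference implementation.

/* Scalar model of vp8_short_fdct4x4 as implemented above.
 * pitch is in bytes; samples are 16-bit. */
void short_fdct4x4_model(short *input, short *output, int pitch)
{
    int i;
    int a1, b1, c1, d1;
    short *ip = input;
    short *op = output;

    /* first pass: rows, input scaled up by 8 (the psllw 3) */
    for (i = 0; i < 4; i++)
    {
        a1 = (ip[0] + ip[3]) << 3;
        b1 = (ip[1] + ip[2]) << 3;
        c1 = (ip[1] - ip[2]) << 3;
        d1 = (ip[0] - ip[3]) << 3;

        op[0] = a1 + b1;
        op[2] = a1 - b1;
        op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
        op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

        ip += pitch / 2;
        op += 4;
    }

    /* second pass: columns, with final rounding */
    ip = output;
    op = output;
    for (i = 0; i < 4; i++)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0]  = (a1 + b1 + 7) >> 4;
        op[8]  = (a1 - b1 + 7) >> 4;
        op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
        op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

        ip++;
        op++;
    }
}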
diff --git a/libvpx/vp8/encoder/x86/dct_sse2.asm b/libvpx/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000..d880ce0
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+ %define input rsi
+ %define output rdi
+ %define pitch rax
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0)
+ mov rdi, arg(1)
+
+ movsxd rax, dword ptr arg(2)
+ lea rcx, [rsi + rax*2]
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ %define input rcx
+ %define output rdx
+ %define pitch r8
+ SAVE_XMM 7, u
+ %else
+ %define input rdi
+ %define output rsi
+ %define pitch rdx
+ %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+ %define input
+ %define output
+ %define pitch
+
+%if ABI_IS_32BIT
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2) PRIVATE
+sym(vp8_short_fdct4x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ movq xmm0, MMWORD PTR[input ] ;03 02 01 00
+ movq xmm2, MMWORD PTR[input+ pitch] ;13 12 11 10
+ lea input, [input+2*pitch]
+ movq xmm1, MMWORD PTR[input ] ;23 22 21 20
+ movq xmm3, MMWORD PTR[input+ pitch] ;33 32 31 30
+
+ punpcklqdq xmm0, xmm2 ;13 12 11 10 03 02 01 00
+ punpcklqdq xmm1, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ;23 22 03 02 21 20 01 00
+ punpckhdq xmm2, xmm1 ;33 32 13 12 31 30 11 10
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ;31 21 30 20 11 10 01 00
+ pshufhw xmm1, xmm1, 0b1h ;22 23 02 03 xx xx xx xx
+ pshufhw xmm2, xmm2, 0b1h ;32 33 12 13 xx xx xx xx
+
+ punpckhdq xmm1, xmm2 ;32 33 22 23 12 13 02 03
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm1 ;b1 a1 b1 a1 b1 a1 b1 a1
+ psubw xmm3, xmm1 ;c1 d1 c1 d1 c1 d1 c1 d1
+ psllw xmm0, 3 ;b1 <<= 3 a1 <<= 3
+ psllw xmm3, 3 ;c1 <<= 3 d1 <<= 3
+
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
+
+ paddd xmm3, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_7500)]
+ psrad xmm3, 12 ;(c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ;(d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm0, xmm1 ;op[2] op[0]
+ packssdw xmm3, xmm4 ;op[3] op[1]
+ ; 23 22 21 20 03 02 01 00
+ ;
+ ; 33 32 31 30 13 12 11 10
+ ;
+ movdqa xmm2, xmm0
+ punpcklqdq xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhqdq xmm2, xmm3 ;23 22 21 20 33 32 31 30
+
+ movdqa xmm3, xmm0
+ punpcklwd xmm0, xmm2 ;32 30 22 20 12 10 02 00
+ punpckhwd xmm3, xmm2 ;33 31 23 21 13 11 03 01
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ;13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ;33 32 31 30 23 22 21 20
+
+ movdqa xmm5, XMMWORD PTR[GLOBAL(_7)]
+ pshufd xmm2, xmm2, 04eh
+ movdqa xmm3, xmm0
+ paddw xmm0, xmm2 ;b1 b1 b1 b1 a1 a1 a1 a1
+ psubw xmm3, xmm2 ;c1 c1 c1 c1 d1 d1 d1 d1
+
+ pshufd xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 b1 a1 a1
+ movdqa xmm2, xmm3 ;save d1 for compare
+ pshufd xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 c1 d1 d1
+ pshuflw xmm0, xmm0, 0d8h ;b1 b1 a1 a1 b1 a1 b1 a1
+ pshuflw xmm3, xmm3, 0d8h ;c1 c1 d1 d1 c1 d1 c1 d1
+ pshufhw xmm0, xmm0, 0d8h ;b1 a1 b1 a1 b1 a1 b1 a1
+ pshufhw xmm3, xmm3, 0d8h ;c1 d1 c1 d1 c1 d1 c1 d1
+ movdqa xmm1, xmm0
+ pmaddwd xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
+
+ pxor xmm4, xmm4 ;zero out for compare
+ paddd xmm0, xmm5
+ paddd xmm1, xmm5
+ pcmpeqw xmm2, xmm4
+ psrad xmm0, 4 ;(a1 + b1 + 7)>>4
+ psrad xmm1, 4 ;(a1 - b1 + 7)>>4
+ pandn xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
+ ;and keep bit 0 of lower
+
+ movdqa xmm4, xmm3
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_5352_2217)] ;c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
+ paddd xmm3, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_51000)]
+ packssdw xmm0, xmm1 ;op[8] op[0]
+ psrad xmm3, 16 ;(c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ;(d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm3, xmm4 ;op[12] op[4]
+ movdqa xmm1, xmm0
+ paddw xmm3, xmm2 ;op[4] += (d1!=0)
+ punpcklqdq xmm0, xmm3 ;op[4] op[0]
+ punpckhqdq xmm1, xmm3 ;op[12] op[8]
+
+ movdqa XMMWORD PTR[output + 0], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm1
+
+ STACK_FRAME_DESTROY
+
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2) PRIVATE
+sym(vp8_short_fdct8x4_sse2):
+
+ STACK_FRAME_CREATE
+
+ ; read the input data
+ movdqa xmm0, [input ]
+ movdqa xmm2, [input+ pitch]
+ lea input, [input+2*pitch]
+ movdqa xmm4, [input ]
+ movdqa xmm3, [input+ pitch]
+
+ ; transpose for the first stage
+ movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
+ movdqa xmm5, xmm4 ; 20 21 22 23 24 25 26 27
+
+ punpcklwd xmm0, xmm2 ; 00 10 01 11 02 12 03 13
+ punpckhwd xmm1, xmm2 ; 04 14 05 15 06 16 07 17
+
+ punpcklwd xmm4, xmm3 ; 20 30 21 31 22 32 23 33
+ punpckhwd xmm5, xmm3 ; 24 34 25 35 26 36 27 37
+
+ movdqa xmm2, xmm0 ; 00 10 01 11 02 12 03 13
+ punpckldq xmm0, xmm4 ; 00 10 20 30 01 11 21 31
+
+ punpckhdq xmm2, xmm4 ; 02 12 22 32 03 13 23 33
+
+ movdqa xmm4, xmm1 ; 04 14 05 15 06 16 07 17
+ punpckldq xmm4, xmm5 ; 04 14 24 34 05 15 25 35
+
+ punpckhdq xmm1, xmm5 ; 06 16 26 36 07 17 27 37
+ movdqa xmm3, xmm2 ; 02 12 22 32 03 13 23 33
+
+ punpckhqdq xmm3, xmm1 ; 03 13 23 33 07 17 27 37
+ punpcklqdq xmm2, xmm1 ; 02 12 22 32 06 16 26 36
+
+ movdqa xmm1, xmm0 ; 00 10 20 30 01 11 21 31
+ punpcklqdq xmm0, xmm4 ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1, xmm4             ; 01 11 21 31 05 15 25 35
+
+ ; xmm0 0
+ ; xmm1 1
+ ; xmm2 2
+ ; xmm3 3
+
+ ; first stage
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm2 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ psllw xmm5, 3
+ psllw xmm4, 3
+
+ psllw xmm0, 3
+ psllw xmm1, 3
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; op[0] = a1 + b1
+ psubw xmm2, xmm1 ; op[2] = a1 - b1
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_14500)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_7500)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_7500)]
+
+ psrad xmm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
+ psrad xmm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+ psrad xmm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
+
+ packssdw xmm1, xmm4 ; op[1]
+ packssdw xmm3, xmm5 ; op[3]
+
+ ; done with vertical
+ ; transpose for the second stage
+ movdqa xmm4, xmm0 ; 00 10 20 30 04 14 24 34
+ movdqa xmm5, xmm2 ; 02 12 22 32 06 16 26 36
+
+ punpcklwd xmm0, xmm1 ; 00 01 10 11 20 21 30 31
+ punpckhwd xmm4, xmm1 ; 04 05 14 15 24 25 34 35
+
+ punpcklwd xmm2, xmm3 ; 02 03 12 13 22 23 32 33
+ punpckhwd xmm5, xmm3 ; 06 07 16 17 26 27 36 37
+
+ movdqa xmm1, xmm0 ; 00 01 10 11 20 21 30 31
+ punpckldq xmm0, xmm2 ; 00 01 02 03 10 11 12 13
+
+ punpckhdq xmm1, xmm2 ; 20 21 22 23 30 31 32 33
+
+ movdqa xmm2, xmm4 ; 04 05 14 15 24 25 34 35
+ punpckldq xmm2, xmm5 ; 04 05 06 07 14 15 16 17
+
+ punpckhdq xmm4, xmm5 ; 24 25 26 27 34 35 36 37
+ movdqa xmm3, xmm1 ; 20 21 22 23 30 31 32 33
+
+ punpckhqdq xmm3, xmm4 ; 30 31 32 33 34 35 36 37
+ punpcklqdq xmm1, xmm4 ; 20 21 22 23 24 25 26 27
+
+ movdqa xmm4, xmm0 ; 00 01 02 03 10 11 12 13
+ punpcklqdq xmm0, xmm2 ; 00 01 02 03 04 05 06 07
+
+ punpckhqdq xmm4, xmm2 ; 10 11 12 13 14 15 16 17
+
+ ; xmm0 0
+ ; xmm1 2
+ ; xmm3 3
+ ; xmm4 1
+
+ movdqa xmm5, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm0, xmm3 ; a1 = 0 + 3
+ paddw xmm1, xmm4 ; b1 = 1 + 2
+
+ psubw xmm4, xmm2 ; c1 = 1 - 2
+ psubw xmm5, xmm3 ; d1 = 0 - 3
+
+ pxor xmm6, xmm6 ; zero out for compare
+
+ pcmpeqw xmm6, xmm5 ; d1 != 0
+
+ pandn xmm6, XMMWORD PTR[GLOBAL(_cmp_mask8x4)] ; clear upper,
+ ; and keep bit 0 of lower
+
+ ; output 0 and 2
+ movdqa xmm2, xmm0 ; a1
+
+ paddw xmm0, xmm1 ; a1 + b1
+ psubw xmm2, xmm1 ; a1 - b1
+
+ paddw xmm0, XMMWORD PTR[GLOBAL(_7w)]
+ paddw xmm2, XMMWORD PTR[GLOBAL(_7w)]
+
+ psraw xmm0, 4 ; op[0] = (a1 + b1 + 7)>>4
+ psraw xmm2, 4 ; op[8] = (a1 - b1 + 7)>>4
+
+ ; output 1 and 3
+ ; interleave c1, d1
+ movdqa xmm1, xmm5 ; d1
+ punpcklwd xmm1, xmm4 ; c1 d1
+ punpckhwd xmm5, xmm4 ; c1 d1
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm5
+
+ pmaddwd xmm1, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+ pmaddwd xmm4, XMMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
+
+ pmaddwd xmm3, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+ pmaddwd xmm5, XMMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
+
+ paddd xmm1, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm4, XMMWORD PTR[GLOBAL(_12000)]
+ paddd xmm3, XMMWORD PTR[GLOBAL(_51000)]
+ paddd xmm5, XMMWORD PTR[GLOBAL(_51000)]
+
+ psrad xmm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
+ psrad xmm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+ psrad xmm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
+
+ packssdw xmm1, xmm4 ; op[4]
+ packssdw xmm3, xmm5 ; op[12]
+
+ paddw xmm1, xmm6 ; op[4] += (d1!=0)
+
+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm2
+
+ punpcklqdq xmm0, xmm1
+ punpckhqdq xmm4, xmm1
+
+ punpcklqdq xmm2, xmm3
+ punpckhqdq xmm5, xmm3
+
+ movdqa XMMWORD PTR[output + 0 ], xmm0
+ movdqa XMMWORD PTR[output + 16], xmm2
+ movdqa XMMWORD PTR[output + 32], xmm4
+ movdqa XMMWORD PTR[output + 48], xmm5
+
+ STACK_FRAME_DESTROY
+
+SECTION_RODATA
+align 16
+_5352_2217:
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+ dw 5352
+ dw 2217
+align 16
+_2217_neg5352:
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+ dw 2217
+ dw -5352
+align 16
+_mult_add:
+ times 8 dw 1
+align 16
+_cmp_mask:
+ times 4 dw 1
+ times 4 dw 0
+align 16
+_cmp_mask8x4:
+ times 8 dw 1
+align 16
+_mult_sub:
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+ dw 1
+ dw -1
+align 16
+_7:
+ times 4 dd 7
+align 16
+_7w:
+ times 8 dw 7
+align 16
+_14500:
+ times 4 dd 14500
+align 16
+_7500:
+ times 4 dd 7500
+align 16
+_12000:
+ times 4 dd 12000
+align 16
+_51000:
+ times 4 dd 51000
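
The two routines above vectorize the same 4x4 forward DCT, the 8x4 variant simply transforming two 4x4 blocks side by side. A scalar sketch, paraphrasing the C reference in vp8/encoder/dct.c, maps the magic numbers back to arithmetic: the 5352/2217 multipliers, the 14500/7500 first-pass and 12000/51000 second-pass rounding terms, and the (d1 != 0) adjustment added into op[4].

    /* Scalar sketch of the 4x4 FDCT vectorized above; paraphrased from the
     * C reference (vp8/encoder/dct.c). pitch is in bytes, ip in shorts. */
    void short_fdct4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)        /* rows; input pre-scaled by 8 */
        {
            a1 = (ip[0] + ip[3]) * 8;
            b1 = (ip[1] + ip[2]) * 8;
            c1 = (ip[1] - ip[2]) * 8;
            d1 = (ip[0] - ip[3]) * 8;

            op[0] = a1 + b1;
            op[2] = a1 - b1;
            op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12;
            op[3] = (d1 * 2217 - c1 * 5352 +  7500) >> 12;

            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++)        /* columns; final rounding */
        {
            a1 = ip[0] + ip[12];
            b1 = ip[4] + ip[8];
            c1 = ip[4] - ip[8];
            d1 = ip[0] - ip[12];

            op[0]  = (a1 + b1 + 7) >> 4;
            op[8]  = (a1 - b1 + 7) >> 4;
            op[4]  = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0);
            op[12] =  (d1 * 2217 - c1 * 5352 + 51000) >> 16;

            ip++;
            op++;
        }
    }

The x8 pre-scaling (psllw 3 in the asm) buys precision for the integer multiplies; the column pass takes it back out with the (x + 7) >> 4 rounding.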
diff --git a/libvpx/vp8/encoder/x86/denoising_sse2.c b/libvpx/vp8/encoder/x86/denoising_sse2.c
new file mode 100644
index 0000000..c1ac6c1
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/encoder/denoising.h"
+#include "vp8/common/reconinter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+
+#include <stdlib.h> /* abs() */
+#include <emmintrin.h>
+
+union sum_union {
+ __m128i v;
+ signed char e[16];
+};
+
+int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
+ YV12_BUFFER_CONFIG *running_avg,
+ MACROBLOCK *signal, unsigned int motion_magnitude,
+ int y_offset, int uv_offset)
+{
+ unsigned char *sig = signal->thismb;
+ int sig_stride = 16;
+ unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
+ int mc_avg_y_stride = mc_running_avg->y_stride;
+ unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
+ int avg_y_stride = running_avg->y_stride;
+ int r;
+ __m128i acc_diff = _mm_setzero_si128();
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ /* Modify each level's adjustment according to motion_magnitude. */
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
+ /* Difference between level 3 and level 2 is 2. */
+ const __m128i l32 = _mm_set1_epi8(2);
+ /* Difference between level 2 and level 1 is 1. */
+ const __m128i l21 = _mm_set1_epi8(1);
+
+ for (r = 0; r < 16; ++r)
+ {
+ /* Calculate differences */
+ const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y = _mm_loadu_si128(
+ (__m128i *)(&mc_running_avg_y[0]));
+ __m128i v_running_avg_y;
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ /* Obtain the sign. FF if diff is negative. */
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+ /* Clamp absolute difference to 16 to be used to get mask. Doing this
+ * allows us to use _mm_cmpgt_epi8, which operates on signed bytes. */
+ const __m128i clamped_absdiff = _mm_min_epu8(
+ _mm_or_si128(pdiff, ndiff), k_16);
+ /* Get masks for l2 l1 and l0 adjustments */
+ const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+ const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+ const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+ /* Get adjustments for l2, l1, and l0 */
+ __m128i adj2 = _mm_and_si128(mask2, l32);
+ const __m128i adj1 = _mm_and_si128(mask1, l21);
+ const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+ __m128i adj, padj, nadj;
+
+ /* Combine the adjustments and get absolute adjustments. */
+ adj2 = _mm_add_epi8(adj2, adj1);
+ adj = _mm_sub_epi8(l3, adj2);
+ adj = _mm_andnot_si128(mask0, adj);
+ adj = _mm_or_si128(adj, adj0);
+
+ /* Restore the sign and get positive and negative adjustments. */
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+
+ /* Calculate filtered value. */
+ v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ /* Adjustments <=7, and each element in acc_diff can fit in signed
+ * char.
+ */
+ acc_diff = _mm_adds_epi8(acc_diff, padj);
+ acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+ /* Update pointers for next iteration. */
+ sig += sig_stride;
+ mc_running_avg_y += mc_avg_y_stride;
+ running_avg_y += avg_y_stride;
+ }
+
+ {
+ /* Compute the sum of all pixel differences of this MB. */
+ union sum_union s;
+ int sum_diff = 0;
+ s.v = acc_diff;
+ sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
+ + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+ + s.e[12] + s.e[13] + s.e[14] + s.e[15];
+
+ if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
+ {
+ return COPY_BLOCK;
+ }
+ }
+
+ vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+ signal->thismb, sig_stride);
+ return FILTER_BLOCK;
+}
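
Each loop iteration above filters one 16-pixel row. Per pixel, the mask arithmetic reduces to a small adjustment schedule keyed on the absolute difference between the signal and the motion-compensated running average; the helper below is a hypothetical scalar restatement derived from the intrinsics, not the project's C reference.

    #include <stdlib.h> /* abs */

    /* Hypothetical scalar restatement of one pixel of the SSE2 loop above:
     * absdiff < 4 lands exactly on mc_avg; larger differences move the
     * signal toward mc_avg by l3 - 3, l3 - 2 or l3 (l3 is 7, or 6 under
     * high motion). */
    static unsigned char denoise_pixel_sketch(unsigned char sig,
                                              unsigned char mc_avg, int l3)
    {
        int diff = mc_avg - sig;
        int absdiff = abs(diff);
        int adj, v;

        if (absdiff < 4)       adj = absdiff;  /* copies mc_avg exactly */
        else if (absdiff < 8)  adj = l3 - 3;
        else if (absdiff < 16) adj = l3 - 2;
        else                   adj = l3;

        v = sig + (diff > 0 ? adj : -adj);     /* padj/nadj, saturating */
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (unsigned char)v;
    }

acc_diff then tracks the signed sum of the adjustments so the function can fall back to COPY_BLOCK when the block as a whole changed too much.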
diff --git a/libvpx/vp8/encoder/x86/encodeopt.asm b/libvpx/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 0000000..fe26b18
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,386 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_xmm) PRIVATE
+sym(vp8_block_error_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;coeff_ptr
+ mov rdi, arg(1) ;dcoef_ptr
+
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
+
+ psubw xmm0, xmm1
+ psubw xmm2, xmm3
+
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm2, xmm2
+
+ paddd xmm0, xmm2
+
+ pxor xmm5, xmm5
+ movdqa xmm1, xmm0
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ psrldq xmm0, 8
+ paddd xmm0, xmm1
+
+ movq rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
+global sym(vp8_block_error_mmx) PRIVATE
+sym(vp8_block_error_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ movq mm3, [rsi]
+
+ movq mm4, [rdi]
+ movq mm5, [rsi+8]
+
+ movq mm6, [rdi+8]
+ pxor mm1, mm1 ; dc = 0 (replaces movd mm1, dc)
+
+ movq mm2, mm7
+ psubw mm5, mm6
+
+ por mm1, mm2
+ pmaddwd mm5, mm5
+
+ pcmpeqw mm1, mm7
+ psubw mm3, mm4
+
+ pand mm1, mm3
+ pmaddwd mm1, mm1
+
+ paddd mm1, mm5
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm3, mm5
+
+ paddd mm1, mm3
+ movq mm0, mm1
+
+ psrlq mm1, 32
+ paddd mm0, mm1
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_mmx_impl) PRIVATE
+sym(vp8_mbblock_error_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor mm7, mm7
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor mm2, mm2
+
+ movd mm1, dword ptr arg(2) ;dc
+ por mm1, mm2
+
+ pcmpeqw mm1, mm7
+ mov rcx, 16
+
+.mberror_loop_mmx:
+ movq mm3, [rsi]
+ movq mm4, [rdi]
+
+ movq mm5, [rsi+8]
+ movq mm6, [rdi+8]
+
+
+ psubw mm5, mm6
+ pmaddwd mm5, mm5
+
+ psubw mm3, mm4
+ pand mm3, mm1
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ movq mm3, [rsi+16]
+
+ movq mm4, [rdi+16]
+ movq mm5, [rsi+24]
+
+ movq mm6, [rdi+24]
+ psubw mm5, mm6
+
+ pmaddwd mm5, mm5
+ psubw mm3, mm4
+
+ pmaddwd mm3, mm3
+ paddd mm2, mm5
+
+ paddd mm2, mm3
+ add rsi, 32
+
+ add rdi, 32
+ sub rcx, 1
+
+ jnz .mberror_loop_mmx
+
+ movq mm0, mm2
+ psrlq mm2, 32
+
+ paddd mm0, mm2
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_xmm_impl) PRIVATE
+sym(vp8_mbblock_error_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ pxor xmm6, xmm6
+
+ mov rdi, arg(1) ;dcoef_ptr
+ pxor xmm4, xmm4
+
+ movd xmm5, dword ptr arg(2) ;dc
+ por xmm5, xmm4
+
+ pcmpeqw xmm5, xmm6
+ mov rcx, 16
+
+.mberror_loop:
+ movdqa xmm0, [rsi]
+ movdqa xmm1, [rdi]
+
+ movdqa xmm2, [rsi+16]
+ movdqa xmm3, [rdi+16]
+
+
+ psubw xmm2, xmm3
+ pmaddwd xmm2, xmm2
+
+ psubw xmm0, xmm1
+ pand xmm0, xmm5
+
+ pmaddwd xmm0, xmm0
+ add rsi, 32
+
+ add rdi, 32
+
+ sub rcx, 1
+ paddd xmm4, xmm2
+
+ paddd xmm4, xmm0
+ jnz .mberror_loop
+
+ movdqa xmm0, xmm4
+ punpckldq xmm0, xmm6
+
+ punpckhdq xmm4, xmm6
+ paddd xmm0, xmm4
+
+ movdqa xmm1, xmm0
+ psrldq xmm0, 8
+
+ paddd xmm0, xmm1
+ movq rax, xmm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_mmx_impl) PRIVATE
+sym(vp8_mbuverror_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor mm7, mm7
+
+.mbuverror_loop_mmx:
+
+ movq mm1, [rsi]
+ movq mm2, [rdi]
+
+ psubw mm1, mm2
+ pmaddwd mm1, mm1
+
+
+ movq mm3, [rsi+8]
+ movq mm4, [rdi+8]
+
+ psubw mm3, mm4
+ pmaddwd mm3, mm3
+
+
+ paddd mm7, mm1
+ paddd mm7, mm3
+
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz .mbuverror_loop_mmx
+
+ movq mm0, mm7
+ psrlq mm7, 32
+
+ paddd mm0, mm7
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_xmm_impl) PRIVATE
+sym(vp8_mbuverror_xmm_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;s_ptr
+ mov rdi, arg(1) ;d_ptr
+
+ mov rcx, 16
+ pxor xmm3, xmm3
+
+.mbuverror_loop:
+
+ movdqa xmm1, [rsi]
+ movdqa xmm2, [rdi]
+
+ psubw xmm1, xmm2
+ pmaddwd xmm1, xmm1
+
+ paddd xmm3, xmm1
+
+ add rsi, 16
+ add rdi, 16
+
+ dec rcx
+ jnz .mbuverror_loop
+
+ pxor xmm0, xmm0
+ movdqa xmm1, xmm3
+
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ paddd xmm1, xmm2
+
+ movdqa xmm2, xmm1
+
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ movq rax, xmm1
+
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
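
All six routines in this file compute a sum of squared differences between original and dequantized coefficients; the mbblock variants optionally mask out the DC coefficient (the dc argument), and the mbuverror variants walk the chroma blocks. The 4x4 kernel, in scalar form:

    /* Scalar equivalent of vp8_block_error_*: sum of squared coefficient
     * differences over one 4x4 block (16 coefficients). */
    int block_error_sketch(const short *coeff, const short *dqcoeff)
    {
        int i, error = 0;
        for (i = 0; i < 16; i++)
        {
            int d = coeff[i] - dqcoeff[i];
            error += d * d;
        }
        return error;
    }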
diff --git a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 0000000..f498927
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,164 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2) PRIVATE
+sym(vp8_short_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ; input
+ mov rdi, arg(1) ; output
+ movsxd rdx, dword ptr arg(2) ; pitch
+
+ ; first for loop
+ movq xmm0, MMWORD PTR [rsi] ; load input
+ movq xmm1, MMWORD PTR [rsi + rdx]
+ lea rsi, [rsi + rdx*2]
+ movq xmm2, MMWORD PTR [rsi]
+ movq xmm3, MMWORD PTR [rsi + rdx]
+
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm2 ; ip[1] ip[0]
+ punpckhdq xmm1, xmm2 ; ip[3] ip[2]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ psllw xmm0, 2 ; d1 a1
+ psllw xmm2, 2 ; c1 b1
+
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2 ; b1 a1
+ punpckhqdq xmm1, xmm2 ; c1 d1
+
+ pxor xmm6, xmm6
+ movq xmm6, xmm0
+ pxor xmm7, xmm7
+ pcmpeqw xmm7, xmm6
+ paddw xmm7, [GLOBAL(c1)]
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1 ; b1+c1 a1+d1
+ psubw xmm2, xmm1 ; b1-c1 a1-d1
+ paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
+
+ ; second for loop
+ ; input: 13 9 5 1 12 8 4 0 (xmm0)
+ ; 14 10 6 2 15 11 7 3 (xmm2)
+ ; after shuffle:
+ ; 13 5 9 1 12 4 8 0 (xmm0)
+ ; 14 6 10 2 15 7 11 3 (xmm1)
+ pshuflw xmm3, xmm0, 0xd8
+ pshufhw xmm0, xmm3, 0xd8
+ pshuflw xmm3, xmm2, 0xd8
+ pshufhw xmm1, xmm3, 0xd8
+
+ movdqa xmm2, xmm0
+ pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
+ pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
+ movdqa xmm3, xmm1
+ pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
+ pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
+
+ pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
+ pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
+ pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
+ pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
+
+ movdqa xmm0, xmm4
+ punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
+ punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
+ movdqa xmm1, xmm6
+ punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
+ punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
+
+ movdqa xmm2, xmm0
+ paddd xmm0, xmm4 ; b21 b20 a21 a20
+ psubd xmm2, xmm4 ; c21 c20 d21 d20
+ movdqa xmm3, xmm1
+ paddd xmm1, xmm6 ; b23 b22 a23 a22
+ psubd xmm3, xmm6 ; c23 c22 d23 d22
+
+ pxor xmm4, xmm4
+ movdqa xmm5, xmm4
+ pcmpgtd xmm4, xmm0
+ pcmpgtd xmm5, xmm2
+ pand xmm4, [GLOBAL(cd1)]
+ pand xmm5, [GLOBAL(cd1)]
+
+ pxor xmm6, xmm6
+ movdqa xmm7, xmm6
+ pcmpgtd xmm6, xmm1
+ pcmpgtd xmm7, xmm3
+ pand xmm6, [GLOBAL(cd1)]
+ pand xmm7, [GLOBAL(cd1)]
+
+ paddd xmm0, xmm4
+ paddd xmm2, xmm5
+ paddd xmm0, [GLOBAL(cd3)]
+ paddd xmm2, [GLOBAL(cd3)]
+ paddd xmm1, xmm6
+ paddd xmm3, xmm7
+ paddd xmm1, [GLOBAL(cd3)]
+ paddd xmm3, [GLOBAL(cd3)]
+
+ psrad xmm0, 3
+ psrad xmm1, 3
+ psrad xmm2, 3
+ psrad xmm3, 3
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
+ punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
+ movdqa xmm5, xmm2
+ punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
+ punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
+
+ packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
+ packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm2
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+c1:
+ dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
+align 16
+cn1:
+ dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
+align 16
+cd1:
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+align 16
+cd3:
+ dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
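
A scalar sketch of the Walsh-Hadamard transform above, paraphrasing the C reference, makes the constants explicit: the x4 pre-scaling (psllw 2), the (a1 != 0) adjustment applied through the c1 table, the cd1 bias that rounds negative values toward zero, and the final (x + 3) >> 3 from cd3.

    /* Scalar sketch of vp8_short_walsh4x4, paraphrased from the C
     * reference; matches the shifts and constants in the asm above. */
    void short_walsh4x4_sketch(short *input, short *output, int pitch)
    {
        int i, a1, b1, c1, d1, a2, b2, c2, d2;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)            /* first pass: rows, scaled by 4 */
        {
            a1 = (ip[0] + ip[2]) * 4;
            d1 = (ip[1] + ip[3]) * 4;
            c1 = (ip[1] - ip[3]) * 4;
            b1 = (ip[0] - ip[2]) * 4;

            op[0] = a1 + d1 + (a1 != 0);   /* the c1/pcmpeqw adjustment */
            op[1] = b1 + c1;
            op[2] = b1 - c1;
            op[3] = a1 - d1;
            ip += pitch / 2;
            op += 4;
        }

        ip = output;
        op = output;
        for (i = 0; i < 4; i++)            /* second pass: columns */
        {
            a1 = ip[0] + ip[8];
            d1 = ip[4] + ip[12];
            c1 = ip[4] - ip[12];
            b1 = ip[0] - ip[8];

            a2 = a1 + d1; b2 = b1 + c1; c2 = b1 - c1; d2 = a1 - d1;
            a2 += a2 < 0; b2 += b2 < 0;    /* cd1: bias negatives by 1 */
            c2 += c2 < 0; d2 += d2 < 0;

            op[0]  = (a2 + 3) >> 3;        /* cd3, then >> 3 */
            op[4]  = (b2 + 3) >> 3;
            op[8]  = (c2 + 3) >> 3;
            op[12] = (d2 + 3) >> 3;
            ip++;
            op++;
        }
    }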
diff --git a/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libvpx/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 0000000..2864ce1
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,286 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+; short *qcoeff_ptr,short *dequant_ptr,
+; short *scan_mask, short *round_ptr,
+; short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
+sym(vp8_fast_quantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;coeff_ptr
+ movq mm0, [rsi]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm1, [rax]
+
+ movq mm3, mm0
+ psraw mm0, 15
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ; abs
+
+ movq mm2, mm3
+ pcmpgtw mm1, mm2
+
+ pandn mm1, mm2
+ movq mm3, mm1
+
+ mov rdx, arg(6) ;quant_ptr
+ movq mm1, [rdx]
+
+ mov rcx, arg(5) ;round_ptr
+ movq mm2, [rcx]
+
+ paddw mm3, mm2
+ pmulhuw mm3, mm1
+
+ pxor mm3, mm0
+ psubw mm3, mm0 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+ movq mm0, mm3
+
+ movq [rdi], mm3
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm2, [rax]
+
+ pmullw mm3, mm2
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax], mm3
+
+ ; next 8
+ movq mm4, [rsi+8]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+8]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+8]
+ movq mm6, [rcx+8]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+8], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+8]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+8], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+16]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+16]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+16]
+ movq mm6, [rcx+16]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+16], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+16]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+16], mm7
+
+
+ ; next 8
+ movq mm4, [rsi+24]
+
+ mov rax, arg(1) ;zbin_ptr
+ movq mm5, [rax+24]
+
+ movq mm7, mm4
+ psraw mm4, 15
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ; abs
+
+ movq mm6, mm7
+ pcmpgtw mm5, mm6
+
+ pandn mm5, mm6
+ movq mm7, mm5
+
+ movq mm5, [rdx+24]
+ movq mm6, [rcx+24]
+
+ paddw mm7, mm6
+ pmulhuw mm7, mm5
+
+ pxor mm7, mm4
+ psubw mm7, mm4 ;gain the sign back
+
+ mov rdi, arg(2) ;qcoeff_ptr
+
+ movq mm1, mm7
+ movq [rdi+24], mm7
+
+ mov rax, arg(3) ;dequant_ptr
+ movq mm6, [rax+24]
+
+ pmullw mm7, mm6
+ mov rax, arg(7) ;dqcoeff_ptr
+
+ movq [rax+24], mm7
+
+
+
+ mov rdi, arg(4) ;scan_mask
+ mov rsi, arg(2) ;qcoeff_ptr
+
+ pxor mm5, mm5
+ pxor mm7, mm7
+
+ movq mm0, [rsi]
+ movq mm1, [rsi+8]
+
+ movq mm2, [rdi]
+ movq mm3, [rdi+8]
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ movq mm5, mm0
+
+ paddd mm5, mm1
+
+ movq mm0, [rsi+16]
+ movq mm1, [rsi+24]
+
+ movq mm2, [rdi+16]
+ movq mm3, [rdi+24]
+
+ pcmpeqw mm0, mm7
+ pcmpeqw mm1, mm7
+
+ pcmpeqw mm6, mm6
+ pxor mm0, mm6
+
+ pxor mm1, mm6
+ psrlw mm0, 15
+
+ psrlw mm1, 15
+ pmaddwd mm0, mm2
+
+ pmaddwd mm1, mm3
+ paddd mm5, mm0
+
+ paddd mm5, mm1
+ movq mm0, mm5
+
+ psrlq mm5, 32
+ paddd mm0, mm5
+
+ ; eob adjustment begins here
+ movq rcx, mm0
+ and rcx, 0xffff
+
+ xor rdx, rdx
+ sub rdx, rcx ; rdx=-rcx
+
+ bsr rax, rcx
+ inc rax
+
+ sar rdx, 31
+ and rax, rdx
+ ; The branchless sequence above replaces the old mixed assembly/C eob
+ ; computation. The original is kept below as reference:
+ ; movq rcx, mm0
+ ; bsr rax, rcx
+ ;
+ ; mov eob, rax
+ ; mov eee, rcx
+ ;
+ ;if(eee==0)
+ ;{
+ ; eob=-1;
+ ;}
+ ;else if(eee<0)
+ ;{
+ ; eob=15;
+ ;}
+ ;d->eob = eob+1;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
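
In scalar terms, the routine above quantizes 4 coefficients per MMX pass: strip the sign, zero anything below the zero-bin, add the rounding term, keep the high 16 bits of the multiply by quant (pmulhuw), restore the sign, and dequantize. A hypothetical restatement, with names echoing the prototype above:

    /* Hypothetical scalar restatement of the MMX fast quantize above; the
     * real scalar entry points live in vp8/encoder/quantize.c. */
    static void fast_quantize_sketch(const short *coeff, const short *zbin,
                                     const short *round, const short *quant,
                                     const short *dequant,
                                     short *qcoeff, short *dqcoeff)
    {
        int i;
        for (i = 0; i < 16; i++)
        {
            int z  = coeff[i];
            int sz = z >> 15;                  /* psraw: 0 or -1 */
            int x  = (z ^ sz) - sz;            /* abs(z) */

            if (x < zbin[i])                   /* pcmpgtw/pandn dead zone */
                x = 0;
            x = ((x + round[i]) * (unsigned short)quant[i]) >> 16;
            x = (x ^ sz) - sz;                 /* gain the sign back */

            qcoeff[i]  = (short)x;
            dqcoeff[i] = (short)(x * dequant[i]);
        }
    }

The scan_mask/pmaddwd block that follows in the asm builds the eob from the nonzero flags, which the bsr sequence then reduces without branching.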
diff --git a/libvpx/vp8/encoder/x86/quantize_sse2.asm b/libvpx/vp8/encoder/x86/quantize_sse2.asm
new file mode 100644
index 0000000..724e54c
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_sse2.asm
@@ -0,0 +1,386 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_regular_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_regular_quantize_b_sse2) PRIVATE
+sym(vp8_regular_quantize_b_sse2):
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 7
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %endif
+%endif
+
+ ALIGN_STACK 16, rax
+ %define zrun_zbin_boost 0 ; 8
+ %define abs_minus_zbin 8 ; 32
+ %define temp_qcoeff 40 ; 32
+ %define qcoeff 72 ; 32
+ %define stack_size 104
+ sub rsp, stack_size
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rdx, [rdi + vp8_block_coeff] ; coeff_ptr
+ mov rcx, [rdi + vp8_block_zbin] ; zbin_ptr
+ movd xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
+
+ ; z
+ movdqa xmm0, [rdx]
+ movdqa xmm4, [rdx + 16]
+ mov rdx, [rdi + vp8_block_round] ; round_ptr
+
+ pshuflw xmm7, xmm7, 0
+ punpcklwd xmm7, xmm7 ; duplicated zbin_oq_value
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ ; (z ^ sz)
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+
+ ; x = abs(z)
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+ mov rcx, [rdi + vp8_block_quant] ; quant_ptr
+
+ ; *zbin_ptr + zbin_oq_value
+ paddw xmm2, xmm7
+ paddw xmm3, xmm7
+
+ ; x - (*zbin_ptr + zbin_oq_value)
+ psubw xmm1, xmm2
+ psubw xmm5, xmm3
+ movdqa [rsp + abs_minus_zbin], xmm1
+ movdqa [rsp + abs_minus_zbin + 16], xmm5
+
+ ; add (zbin_ptr + zbin_oq_value) back
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ movdqa xmm2, [rdx]
+ movdqa xmm6, [rdx + 16]
+
+ movdqa xmm3, [rcx]
+ movdqa xmm7, [rcx + 16]
+
+ ; x + round
+ paddw xmm1, xmm2
+ paddw xmm5, xmm6
+
+ ; y = x * quant_ptr >> 16
+ pmulhw xmm3, xmm1
+ pmulhw xmm7, xmm5
+
+ ; y += x
+ paddw xmm1, xmm3
+ paddw xmm5, xmm7
+
+ movdqa [rsp + temp_qcoeff], xmm1
+ movdqa [rsp + temp_qcoeff + 16], xmm5
+
+ pxor xmm6, xmm6
+ ; zero qcoeff
+ movdqa [rsp + qcoeff], xmm6
+ movdqa [rsp + qcoeff + 16], xmm6
+
+ mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
+ mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
+ mov [rsp + zrun_zbin_boost], rdx
+
+%macro ZIGZAG_LOOP 1
+ ; x
+ movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
+
+ ; if (x >= zbin)
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
+ jl .rq_zigzag_loop_%1 ; x < zbin
+
+ movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
+
+ ; downshift by quant_shift[rc]
+ movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
+ sar edi, cl ; also sets Z bit
+ je .rq_zigzag_loop_%1 ; !y
+ mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+ mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP 0
+ZIGZAG_LOOP 1
+ZIGZAG_LOOP 4
+ZIGZAG_LOOP 8
+ZIGZAG_LOOP 5
+ZIGZAG_LOOP 2
+ZIGZAG_LOOP 3
+ZIGZAG_LOOP 6
+ZIGZAG_LOOP 9
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP 7
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
+
+ movdqa xmm2, [rsp + qcoeff]
+ movdqa xmm3, [rsp + qcoeff + 16]
+
+ mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
+ mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
+
+ ; y ^ sz
+ pxor xmm2, xmm0
+ pxor xmm3, xmm4
+ ; x = (y ^ sz) - sz
+ psubw xmm2, xmm0
+ psubw xmm3, xmm4
+
+ ; dequant
+ movdqa xmm0, [rcx]
+ movdqa xmm1, [rcx + 16]
+
+ mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
+
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ movdqa [rcx], xmm2 ; store qcoeff
+ movdqa [rcx + 16], xmm3
+ movdqa [rdi], xmm0 ; store dqcoeff
+ movdqa [rdi + 16], xmm1
+
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ ; select the last value (in zig_zag order) for EOB
+ pcmpeqw xmm2, xmm6
+ pcmpeqw xmm3, xmm6
+ ; !
+ pcmpeqw xmm6, xmm6
+ pxor xmm2, xmm6
+ pxor xmm3, xmm6
+ ; mask inv_zig_zag
+ pand xmm2, [GLOBAL(inv_zig_zag)]
+ pand xmm3, [GLOBAL(inv_zig_zag + 16)]
+ ; select the max value
+ pmaxsw xmm2, xmm3
+ pshufd xmm3, xmm2, 00001110b
+ pmaxsw xmm2, xmm3
+ pshuflw xmm3, xmm2, 00001110b
+ pmaxsw xmm2, xmm3
+ pshuflw xmm3, xmm2, 00000001b
+ pmaxsw xmm2, xmm3
+ movd eax, xmm2
+ and eax, 0xff
+
+ mov BYTE PTR [rcx], al ; store eob
+
+ ; begin epilog
+ add rsp, stack_size
+ pop rsp
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+ RESTORE_GOT
+ RESTORE_XMM
+ pop rbp
+ ret
+
+; void vp8_fast_quantize_b_sse2 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_fast_quantize_b_sse2) PRIVATE
+sym(vp8_fast_quantize_b_sse2):
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %else
+ ; these registers are used for passing arguments
+ %endif
+%endif
+
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_round]
+ mov rdx, [rdi + vp8_block_quant_fast]
+
+ ; z = coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+
+ ; dup z so we can save sz
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ ; x = abs(z) = (z ^ sz) - sz
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ ; x += round
+ paddw xmm1, [rcx]
+ paddw xmm5, [rcx + 16]
+
+ mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rcx, [rsi + vp8_blockd_dequant]
+ mov rdi, [rsi + vp8_blockd_dqcoeff]
+
+ ; y = x * quant >> 16
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+
+ ; x = (y ^ sz) - sz
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ ; qcoeff = x
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
+
+ ; x * dequant
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm5
+ pmullw xmm2, [rcx]
+ pmullw xmm3, [rcx + 16]
+
+ ; dqcoeff = x * dequant
+ movdqa [rdi], xmm2
+ movdqa [rdi + 16], xmm3
+
+ pxor xmm4, xmm4 ;clear all bits
+ pcmpeqw xmm1, xmm4
+ pcmpeqw xmm5, xmm4
+
+ pcmpeqw xmm4, xmm4 ;set all bits
+ pxor xmm1, xmm4
+ pxor xmm5, xmm4
+
+ pand xmm1, [GLOBAL(inv_zig_zag)]
+ pand xmm5, [GLOBAL(inv_zig_zag + 16)]
+
+ pmaxsw xmm1, xmm5
+
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ ; now down to 8
+ pshufd xmm5, xmm1, 00001110b
+
+ pmaxsw xmm1, xmm5
+
+ ; only 4 left
+ pshuflw xmm5, xmm1, 00001110b
+
+ pmaxsw xmm1, xmm5
+
+ ; okay, just 2!
+ pshuflw xmm5, xmm1, 00000001b
+
+ pmaxsw xmm1, xmm5
+
+ movd eax, xmm1
+ and eax, 0xff
+
+ mov BYTE PTR [rcx], al ; store eob
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+inv_zig_zag:
+ dw 0x0001, 0x0002, 0x0006, 0x0007
+ dw 0x0003, 0x0005, 0x0008, 0x000d
+ dw 0x0004, 0x0009, 0x000c, 0x000e
+ dw 0x000a, 0x000b, 0x000f, 0x0010
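
The regular quantizer differs from the fast path in three ways: the zero-bin grows with the current run of zeros (zrun_zbin_boost, reset at each surviving coefficient), each position gets its own downshift (quant_shift), and both effects depend on zig-zag order, which is why the unrolled ZIGZAG_LOOP visits coefficients in scan order. A scalar shape, paraphrasing the C reference:

    #include <string.h>              /* memset */
    #include "vp8/encoder/block.h"   /* BLOCK, BLOCKD */
    #include "vp8/common/entropy.h"  /* vp8_default_zig_zag1d */

    /* Scalar shape of the regular quantizer above, paraphrasing the C
     * reference; illustrative, not the project's implementation. */
    static void regular_quantize_sketch(BLOCK *b, BLOCKD *d)
    {
        int i, eob = -1;
        short *boost = b->zrun_zbin_boost;

        memset(d->qcoeff,  0, 32);
        memset(d->dqcoeff, 0, 32);

        for (i = 0; i < 16; i++)
        {
            int rc   = vp8_default_zig_zag1d[i];
            int z    = b->coeff[rc];
            int zbin = b->zbin[rc] + *boost++ + b->zbin_extra;
            int sz   = z >> 31;
            int x    = (z ^ sz) - sz;              /* abs(z) */

            if (x >= zbin)
            {
                int y;
                x += b->round[rc];
                y  = (((x * b->quant[rc]) >> 16) + x) >> b->quant_shift[rc];
                x  = (y ^ sz) - sz;                /* restore the sign */
                d->qcoeff[rc]  = (short)x;
                d->dqcoeff[rc] = (short)(x * d->dequant[rc]);
                if (y)
                {
                    eob   = i;                     /* last nonzero in scan */
                    boost = b->zrun_zbin_boost;    /* reset the run boost */
                }
            }
        }
        *d->eob = (char)(eob + 1);
    }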
diff --git a/libvpx/vp8/encoder/x86/quantize_sse4.asm b/libvpx/vp8/encoder/x86/quantize_sse4.asm
new file mode 100644
index 0000000..f0e5d40
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_sse4.asm
@@ -0,0 +1,256 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_regular_quantize_b_sse4 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+
+global sym(vp8_regular_quantize_b_sse4) PRIVATE
+sym(vp8_regular_quantize_b_sse4):
+
+%if ABI_IS_32BIT
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+ push rdi
+ push rsi
+
+ ALIGN_STACK 16, rax
+ %define qcoeff 0 ; 32
+ %define stack_size 32
+ sub rsp, stack_size
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 8, u
+ push rdi
+ push rsi
+ %endif
+%endif
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_zbin]
+ mov rdx, [rdi + vp8_block_round]
+ movd xmm7, [rdi + vp8_block_zbin_extra]
+
+ ; z
+ movdqa xmm0, [rax]
+ movdqa xmm1, [rax + 16]
+
+ ; duplicate zbin_oq_value
+ pshuflw xmm7, xmm7, 0
+ punpcklwd xmm7, xmm7
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ ; sz
+ psraw xmm0, 15
+ psraw xmm1, 15
+
+ ; (z ^ sz)
+ pxor xmm2, xmm0
+ pxor xmm3, xmm1
+
+ ; x = abs(z)
+ psubw xmm2, xmm0
+ psubw xmm3, xmm1
+
+ ; zbin
+ movdqa xmm4, [rcx]
+ movdqa xmm5, [rcx + 16]
+
+ ; *zbin_ptr + zbin_oq_value
+ paddw xmm4, xmm7
+ paddw xmm5, xmm7
+
+ movdqa xmm6, xmm2
+ movdqa xmm7, xmm3
+
+ ; x - (*zbin_ptr + zbin_oq_value)
+ psubw xmm6, xmm4
+ psubw xmm7, xmm5
+
+ ; round
+ movdqa xmm4, [rdx]
+ movdqa xmm5, [rdx + 16]
+
+ mov rax, [rdi + vp8_block_quant_shift]
+ mov rcx, [rdi + vp8_block_quant]
+ mov rdx, [rdi + vp8_block_zrun_zbin_boost]
+
+ ; x + round
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ ; quant
+ movdqa xmm4, [rcx]
+ movdqa xmm5, [rcx + 16]
+
+ ; y = x * quant_ptr >> 16
+ pmulhw xmm4, xmm2
+ pmulhw xmm5, xmm3
+
+ ; y += x
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+ pxor xmm4, xmm4
+%if ABI_IS_32BIT
+ movdqa [rsp + qcoeff], xmm4
+ movdqa [rsp + qcoeff + 16], xmm4
+%else
+ pxor xmm8, xmm8
+%endif
+
+ ; quant_shift
+ movdqa xmm5, [rax]
+
+ ; zrun_zbin_boost
+ mov rax, rdx
+
+%macro ZIGZAG_LOOP 5
+ ; x
+ pextrw ecx, %4, %2
+
+ ; if (x >= zbin)
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
+ jl .rq_zigzag_loop_%1 ; x < zbin
+
+ pextrw edi, %3, %2 ; y
+
+ ; downshift by quant_shift[rc]
+ pextrb ecx, xmm5, %1 ; quant_shift[rc]
+ sar edi, cl ; also sets Z bit
+ je .rq_zigzag_loop_%1 ; !y
+%if ABI_IS_32BIT
+ mov WORD PTR[rsp + qcoeff + %1 *2], di
+%else
+ pinsrw %5, edi, %2 ; qcoeff[rc]
+%endif
+ mov rdx, rax ; reset to b->zrun_zbin_boost
+.rq_zigzag_loop_%1:
+%endmacro
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4
+ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
+ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
+
+ mov rcx, [rsi + vp8_blockd_dequant]
+ mov rdi, [rsi + vp8_blockd_dqcoeff]
+
+%if ABI_IS_32BIT
+ movdqa xmm4, [rsp + qcoeff]
+ movdqa xmm5, [rsp + qcoeff + 16]
+%else
+ %define xmm5 xmm8
+%endif
+
+ ; y ^ sz
+ pxor xmm4, xmm0
+ pxor xmm5, xmm1
+ ; x = (y ^ sz) - sz
+ psubw xmm4, xmm0
+ psubw xmm5, xmm1
+
+ ; dequant
+ movdqa xmm0, [rcx]
+ movdqa xmm1, [rcx + 16]
+
+ mov rcx, [rsi + vp8_blockd_qcoeff]
+
+ pmullw xmm0, xmm4
+ pmullw xmm1, xmm5
+
+ ; store qcoeff
+ movdqa [rcx], xmm4
+ movdqa [rcx + 16], xmm5
+
+ ; store dqcoeff
+ movdqa [rdi], xmm0
+ movdqa [rdi + 16], xmm1
+
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ ; select the last value (in zig_zag order) for EOB
+ pxor xmm6, xmm6
+ pcmpeqw xmm4, xmm6
+ pcmpeqw xmm5, xmm6
+
+ packsswb xmm4, xmm5
+ pshufb xmm4, [GLOBAL(zig_zag1d)]
+ pmovmskb edx, xmm4
+ xor rdi, rdi
+ mov eax, -1
+ xor dx, ax
+ bsr eax, edx
+ sub edi, edx
+ sar edi, 31
+ add eax, 1
+ and eax, edi
+
+ mov BYTE PTR [rcx], al ; store eob
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ add rsp, stack_size
+ pop rsp
+
+ pop rsi
+ pop rdi
+ RESTORE_GOT
+ pop rbp
+%else
+ %undef xmm5
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ RESTORE_XMM
+ %endif
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+; vp8/common/entropy.c: vp8_default_zig_zag1d
+zig_zag1d:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
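
The EOB selection above marks zero coefficients with pcmpeqw, packs the flags to bytes, permutes them into zig-zag order with pshufb, and collapses them into a 16-bit mask with pmovmskb; inverting the mask and applying bsr gives the last nonzero scan position, and the sar/and pair handles the all-zero case without a branch. A hypothetical scalar equivalent:

    /* Hypothetical scalar equivalent of the pshufb/pmovmskb/bsr sequence:
     * bit i of nz_mask is set when the coefficient at zig-zag position i
     * is nonzero; eob is one past the highest such position, 0 if none. */
    static int eob_from_mask(unsigned int nz_mask)
    {
        int eob = 0;
        while (nz_mask)
        {
            nz_mask >>= 1;   /* equivalent to bsr + 1 */
            eob++;
        }
        return eob;
    }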
diff --git a/libvpx/vp8/encoder/x86/quantize_ssse3.asm b/libvpx/vp8/encoder/x86/quantize_ssse3.asm
new file mode 100644
index 0000000..dd526f4
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/quantize_ssse3.asm
@@ -0,0 +1,138 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
+
+
+; void vp8_fast_quantize_b_ssse3 | arg
+; (BLOCK *b, | 0
+; BLOCKD *d) | 1
+;
+
+global sym(vp8_fast_quantize_b_ssse3) PRIVATE
+sym(vp8_fast_quantize_b_ssse3):
+ push rbp
+ mov rbp, rsp
+ GET_GOT rbx
+
+%if ABI_IS_32BIT
+ push rdi
+ push rsi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ push rdi
+ push rsi
+ %endif
+%endif
+ ; end prolog
+
+%if ABI_IS_32BIT
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ mov rdi, rcx ; BLOCK *b
+ mov rsi, rdx ; BLOCKD *d
+ %else
+ ;mov rdi, rdi ; BLOCK *b
+ ;mov rsi, rsi ; BLOCKD *d
+ %endif
+%endif
+
+ mov rax, [rdi + vp8_block_coeff]
+ mov rcx, [rdi + vp8_block_round]
+ mov rdx, [rdi + vp8_block_quant_fast]
+
+ ; coeff
+ movdqa xmm0, [rax]
+ movdqa xmm4, [rax + 16]
+
+ ; round
+ movdqa xmm2, [rcx]
+ movdqa xmm3, [rcx + 16]
+
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+
+ ; sz = z >> 15
+ psraw xmm0, 15
+ psraw xmm4, 15
+
+ pabsw xmm1, xmm1
+ pabsw xmm5, xmm5
+
+ paddw xmm1, xmm2
+ paddw xmm5, xmm3
+
+ ; quant_fast
+ pmulhw xmm1, [rdx]
+ pmulhw xmm5, [rdx + 16]
+
+ mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rdi, [rsi + vp8_blockd_dequant]
+ mov rcx, [rsi + vp8_blockd_dqcoeff]
+
+ movdqa xmm2, xmm1 ;store y for getting eob
+ movdqa xmm3, xmm5
+
+ pxor xmm1, xmm0
+ pxor xmm5, xmm4
+ psubw xmm1, xmm0
+ psubw xmm5, xmm4
+
+ movdqa [rax], xmm1
+ movdqa [rax + 16], xmm5
+
+ movdqa xmm0, [rdi]
+ movdqa xmm4, [rdi + 16]
+
+ pmullw xmm0, xmm1
+ pmullw xmm4, xmm5
+ pxor xmm1, xmm1
+
+ pcmpgtw xmm2, xmm1 ;calculate eob
+ pcmpgtw xmm3, xmm1
+ packsswb xmm2, xmm3
+ pshufb xmm2, [GLOBAL(zz_shuf)]
+
+ pmovmskb edx, xmm2
+
+ movdqa [rcx], xmm0 ;store dqcoeff
+ movdqa [rcx + 16], xmm4 ;store dqcoeff
+ mov rcx, [rsi + vp8_blockd_eob]
+
+ bsr eax, edx ;count 0
+ add eax, 1
+
+ cmp edx, 0 ;if all 0, eob=0
+ cmove eax, edx
+
+ mov BYTE PTR [rcx], al ;store eob
+
+ ; begin epilog
+%if ABI_IS_32BIT
+ pop rsi
+ pop rdi
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ pop rdi
+ %endif
+%endif
+
+ RESTORE_GOT
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+zz_shuf:
+ db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
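
Relative to the SSE2 fast path, the main SSSE3 win is pabsw, which collapses the xor/subtract absolute-value idiom into one instruction (psraw is still needed to keep the sign for restoring later). The idiom it replaces, per value:

    /* The xor/subtract absolute-value idiom that pabsw replaces. */
    static short abs16_sketch(short z)
    {
        short sz = z >> 15;            /* arithmetic shift: 0 or -1 */
        return (short)((z ^ sz) - sz); /* negates z exactly when sz == -1 */
    }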
diff --git a/libvpx/vp8/encoder/x86/ssim_opt.asm b/libvpx/vp8/encoder/x86/ssim_opt.asm
new file mode 100644
index 0000000..5964a85
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/ssim_opt.asm
@@ -0,0 +1,216 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+;void vp8_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0) and could easily fit
+; everything in sse2 without too much hassle. Better estimates can probably
+; be had with psadw or pavgb. At this point this is just a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim
+; as distortion in mode selection code.
+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
+sym(vp8_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0) and could easily fit
+; everything in sse2 without too much hassle. Better estimates can probably
+; be had with psadw or pavgb. At this point this is just a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim
+; as distortion in mode selection code.
+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
+sym(vp8_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
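
Both entry points accumulate the five raw moments SSIM needs over an n by n window, with TABULATE_SSIM adding one register-width slice of each per row. The scalar form, illustrative only, with parameter names following the prototype comments:

    /* Scalar form of what TABULATE_SSIM accumulates, for an n x n window
     * (n = 16 or 8 above); illustrative only. */
    static void ssim_parms_sketch(const unsigned char *s, int sp,
                                  const unsigned char *r, int rp, int n,
                                  unsigned long *sum_s, unsigned long *sum_r,
                                  unsigned long *sum_sq_s,
                                  unsigned long *sum_sq_r,
                                  unsigned long *sum_sxr)
    {
        int i, j;
        for (i = 0; i < n; i++, s += sp, r += rp)
        {
            for (j = 0; j < n; j++)
            {
                *sum_s    += s[j];
                *sum_r    += r[j];
                *sum_sq_s += s[j] * s[j];
                *sum_sq_r += r[j] * r[j];
                *sum_sxr  += s[j] * r[j];
            }
        }
    }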
diff --git a/libvpx/vp8/encoder/x86/subtract_mmx.asm b/libvpx/vp8/encoder/x86/subtract_mmx.asm
new file mode 100644
index 0000000..794dd22
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/subtract_mmx.asm
@@ -0,0 +1,223 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_mmx_impl) PRIVATE
+sym(vp8_subtract_b_mmx_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi], mm0
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2],mm0
+
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
+global sym(vp8_subtract_mby_mmx) PRIVATE
+sym(vp8_subtract_mby_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;src
+ movsxd rdx, dword ptr arg(2);src_stride
+ mov rax, arg(3) ;pred
+ push rbx
+ movsxd rbx, dword ptr arg(4);pred_stride
+
+ pxor mm0, mm0
+ mov rcx, 16
+
+
+.submby_loop:
+ movq mm1, [rsi]
+ movq mm3, [rax]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi], mm1
+ movq [rdi+8], mm2
+
+ movq mm1, [rsi+8]
+ movq mm3, [rax+8]
+
+ movq mm2, mm1
+ movq mm4, mm3
+
+ punpcklbw mm1, mm0
+ punpcklbw mm3, mm0
+
+ punpckhbw mm2, mm0
+ punpckhbw mm4, mm0
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ movq [rdi+16], mm1
+ movq [rdi+24], mm2
+ add rdi, 32
+ lea rax, [rax+rbx]
+ lea rsi, [rsi+rdx]
+ dec rcx
+ jnz .submby_loop
+
+ pop rbx
+ pop rdi
+ pop rsi
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+
+global sym(vp8_subtract_mbuv_mmx) PRIVATE
+sym(vp8_subtract_mbuv_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;usrc
+ movsxd rdx, dword ptr arg(3);src_stride;
+ mov rax, arg(4) ;upred
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ mov rcx, 8
+ push rbx
+ movsxd rbx, dword ptr arg(6);pred_stride
+
+ pxor mm7, mm7
+
+.submbu_loop:
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+ add rdi, 16
+ add rsi, rdx
+ add rax, rbx
+
+ dec rcx
+ jnz .submbu_loop
+
+ mov rsi, arg(2) ;vsrc
+ mov rax, arg(5) ;vpred
+ mov rcx, 8
+
+.submbv_loop:
+ movq mm0, [rsi]
+ movq mm1, [rax]
+ movq mm3, mm0
+ movq mm4, mm1
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ punpckhbw mm3, mm7
+ punpckhbw mm4, mm7
+ psubw mm0, mm1
+ psubw mm3, mm4
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+ add rdi, 16
+ add rsi, rdx
+ add rax, rbx
+
+ dec rcx
+ jnz .submbv_loop
+
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
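
Every routine in this file computes the same residual at different block sizes: the per-pixel difference src - pred, widened to 16 bits. An illustrative scalar form:

    /* Scalar form of the subtract kernels: widen src - pred to 16 bits.
     * Illustrative; rows/cols are 4, 8 or 16 depending on the caller. */
    static void subtract_block_sketch(short *diff, const unsigned char *src,
                                      int src_stride,
                                      const unsigned char *pred,
                                      int pred_stride, int rows, int cols)
    {
        int r, c;
        for (r = 0; r < rows; r++)
        {
            for (c = 0; c < cols; c++)
                diff[c] = (short)(src[c] - pred[c]);
            diff += cols;
            src  += src_stride;
            pred += pred_stride;
        }
    }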
diff --git a/libvpx/vp8/encoder/x86/subtract_sse2.asm b/libvpx/vp8/encoder/x86/subtract_sse2.asm
new file mode 100644
index 0000000..a5d17f5
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/subtract_sse2.asm
@@ -0,0 +1,245 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+; short *diff, unsigned char *Predictor,
+; int pitch);
+global sym(vp8_subtract_b_sse2_impl) PRIVATE
+sym(vp8_subtract_b_sse2_impl):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(2) ;diff
+ mov rax, arg(3) ;Predictor
+ mov rsi, arg(0) ;z
+ movsxd rdx, dword ptr arg(1);src_stride;
+ movsxd rcx, dword ptr arg(4);pitch
+ pxor mm7, mm7
+
+ movd mm0, [rsi]
+ movd mm1, [rax]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi], mm0
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ movd mm0, [rsi+rdx*2]
+ movd mm1, [rax+rcx*2]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*4], mm0
+
+ lea rsi, [rsi+rdx*2]
+ lea rcx, [rcx+rcx*2]
+
+ movd mm0, [rsi+rdx]
+ movd mm1, [rax+rcx]
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ psubw mm0, mm1
+ movq MMWORD PTR [rdi+rcx*2], mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
+global sym(vp8_subtract_mby_sse2) PRIVATE
+sym(vp8_subtract_mby_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;src
+ movsxd rdx, dword ptr arg(2);src_stride
+ mov rax, arg(3) ;pred
+ movdqa xmm4, [GLOBAL(t80)]
+ push rbx
+ mov rcx, 8 ; process two lines at a time
+ movsxd rbx, dword ptr arg(4);pred_stride
+
+.submby_loop:
+ movdqa xmm0, [rsi] ; src
+ movdqa xmm1, [rax] ; pred
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm1 ; put sign back to subtraction
+
+ movdqa xmm3, [rsi + rdx]
+ movdqa xmm5, [rax + rbx]
+
+ lea rsi, [rsi+rdx*2]
+ lea rax, [rax+rbx*2]
+
+ movdqa [rdi], xmm0
+ movdqa [rdi +16], xmm2
+
+ movdqa xmm1, xmm3
+ psubb xmm3, xmm5
+
+ pxor xmm5, xmm4 ;convert to signed values
+ pxor xmm1, xmm4
+ pcmpgtb xmm5, xmm1 ; obtain sign information
+
+ movdqa xmm1, xmm3
+ punpcklbw xmm3, xmm5 ; put sign back to subtraction
+ punpckhbw xmm1, xmm5 ; put sign back to subtraction
+
+ movdqa [rdi +32], xmm3
+ movdqa [rdi +48], xmm1
+
+ add rdi, 64
+ dec rcx
+ jnz .submby_loop
+
+ pop rbx
+ pop rdi
+ pop rsi
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
+; int src_stride, unsigned char *upred,
+; unsigned char *vpred, int pred_stride)
+global sym(vp8_subtract_mbuv_sse2) PRIVATE
+sym(vp8_subtract_mbuv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movdqa xmm4, [GLOBAL(t80)]
+ mov rdi, arg(0) ;diff
+ mov rsi, arg(1) ;usrc
+ movsxd rdx, dword ptr arg(3);src_stride;
+ mov rax, arg(4) ;upred
+ add rdi, 256*2 ;diff = diff + 256 (shorts)
+ mov rcx, 4
+ push rbx
+ movsxd rbx, dword ptr arg(6);pred_stride
+
+ ;u
+.submbu_loop:
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx] ; src -- next line
+ movq xmm1, [rax] ; pred
+ movq xmm3, [rax+rbx] ; pred -- next line
+ lea rsi, [rsi + rdx*2]
+ lea rax, [rax + rbx*2]
+
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; 8-bit subtraction, sign recovered below
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0 ; store difference
+ movdqa [rdi +16], xmm2 ; store difference
+ add rdi, 32
+ sub rcx, 1
+ jnz .submbu_loop
+
+ mov rsi, arg(2) ;vsrc
+ mov rax, arg(5) ;vpred
+ mov rcx, 4
+
+ ;v
+.submbv_loop:
+ movq xmm0, [rsi] ; src
+ movq xmm2, [rsi+rdx] ; src -- next line
+ movq xmm1, [rax] ; pred
+ movq xmm3, [rax+rbx] ; pred -- next line
+ lea rsi, [rsi + rdx*2]
+ lea rax, [rax + rbx*2]
+
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ psubb xmm0, xmm1 ; 8-bit subtraction, sign recovered below
+
+ pxor xmm1, xmm4 ;convert to signed values
+ pxor xmm2, xmm4
+ pcmpgtb xmm1, xmm2 ; obtain sign information
+
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ punpcklbw xmm0, xmm1 ; put sign back to subtraction
+ punpckhbw xmm2, xmm3 ; put sign back to subtraction
+
+ movdqa [rdi], xmm0 ; store difference
+ movdqa [rdi +16], xmm2 ; store difference
+ add rdi, 32
+ sub rcx, 1
+ jnz .submbv_loop
+
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t80:
+ times 16 db 0x80
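
Unlike the MMX file, the SSE2 version subtracts whole rows with psubb, which wraps modulo 256, and then rebuilds the sign: xoring both inputs with 0x80 turns the unsigned comparison pred > src into a signed pcmpgtb, and punpcklbw pairs each wrapped difference with its sign byte, which is exactly sign extension to 16 bits. A hypothetical per-pixel view:

    /* Hypothetical per-pixel view of the psubb/pcmpgtb/punpcklbw sequence:
     * the wrapped 8-bit difference plus a computed sign byte reassemble
     * the true 16-bit residual. */
    static short widen_diff_sketch(unsigned char s, unsigned char p)
    {
        unsigned char d8   = (unsigned char)(s - p); /* psubb: wraps mod 256 */
        unsigned char sign = (p > s) ? 0xff : 0x00;  /* pcmpgtb after ^0x80  */
        return (short)((sign << 8) | d8);            /* punpcklbw d8 | sign  */
    }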
diff --git a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000..ce9d983
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0
+; unsigned int stride, | 1
+; unsigned char *frame2, | 2
+; unsigned int block_size, | 3
+; int strength, | 4
+; int filter_weight, | 5
+; unsigned int *accumulator, | 6
+; unsigned short *count) | 7
+global sym(vp8_temporal_filter_apply_sse2) PRIVATE
+sym(vp8_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_size 0
+ %define strength 16
+ %define filter_weight 32
+ %define rounding_bit 48
+ %define rbp_backup 64
+ %define stack_size 80
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog
+
+ mov rdx, arg(3)
+ mov [rsp + block_size], rdx
+ movd xmm6, arg(4)
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(4) ; 16 - strength
+ movd xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)]
+ psrlw xmm5, xmm4
+ movdqa [rsp + rounding_bit], xmm5
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(6) ; accumulator
+ mov rax, arg(7) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(5) ; filter_weight
+ pshuflw xmm0, xmm0, 0
+ punpcklwd xmm0, xmm0
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
+ lea rcx, [rdx + 16*16*1]
+ cmp dword ptr [rsp + block_size], 8
+ jne .temporal_filter_apply_load_16
+ lea rcx, [rdx + 8*8*1]
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1)
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2)
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+ pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
+ pmullw xmm1, xmm1 ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+ ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1
+ psubusw xmm2, xmm0
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; load and extract the predictor up to shorts
+ pxor xmm7, xmm7
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48]
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_size], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup]
+ add rsp, stack_size
+ pop rsp
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+ times 8 dw 3
+align 16
+_const_top_bit:
+ times 8 dw 1<<15
+align 16
+_const_16w:
+ times 8 dw 16
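
Collected into one place, the per-byte arithmetic annotated above amounts to the following plain-C sketch (the asm performs the same steps in 16-bit SIMD lanes with saturation; the prototype at the top of the file is the real signature, the helper name here is illustrative):

    static void temporal_filter_apply_c(const unsigned char *frame1, /* src  */
                                        const unsigned char *frame2, /* pred */
                                        unsigned int n, int strength,
                                        int filter_weight,
                                        unsigned int *accumulator,
                                        unsigned short *count)
    {
        const int rounding = 0x8000 >> (16 - strength);
        unsigned int i;
        for (i = 0; i < n; i++)
        {
            int diff = frame1[i] - frame2[i];
            int mod = (3 * diff * diff + rounding) >> strength;
            if (mod > 16)
                mod = 16;               /* psubusw saturates 16 - mod at 0 */
            mod = (16 - mod) * filter_weight;
            count[i] += (unsigned short)mod;
            accumulator[i] += (unsigned int)mod * frame2[i];
        }
    }

A pixel that matches its prediction receives the full weight of 16 * filter_weight; large differences decay toward zero, so poorly predicted pixels contribute little to the accumulated frame.
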
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
new file mode 100644
index 0000000..da25f52
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/encoder/block.h"
+
+void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+ vp8_short_fdct4x4_mmx(input, output, pitch);
+ vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+ short *qcoeff_ptr, short *dequant_ptr,
+ const short *scan_mask, short *round_ptr,
+ short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
+ const short *scan_mask = vp8_default_zig_zag_mask;
+ short *coeff_ptr = b->coeff;
+ short *zbin_ptr = b->zbin;
+ short *round_ptr = b->round;
+ short *quant_ptr = b->quant_fast;
+ short *qcoeff_ptr = d->qcoeff;
+ short *dqcoeff_ptr = d->dqcoeff;
+ short *dequant_ptr = d->dequant;
+
+ *d->eob = (char)vp8_fast_quantize_b_impl_mmx(
+ coeff_ptr,
+ zbin_ptr,
+ qcoeff_ptr,
+ dequant_ptr,
+ scan_mask,
+ round_ptr,
+ quant_ptr,
+ dqcoeff_ptr
+ );
+}
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
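
These stubs only unpack BLOCK/BLOCKD fields and hand raw pointers to the assembly. For reference, a scalar sketch of what vp8_mbuverror_mmx_impl computes over the 128 chroma coefficients starting at index 256 (illustrative name; the SIMD version reduces the same sum in parallel):

    static int mbuverror_c(const short *s_ptr, const short *d_ptr)
    {
        int i, error = 0;
        for (i = 0; i < 128; i++)   /* 8 chroma 4x4 blocks, 16 coeffs each */
        {
            const int diff = s_ptr[i] - d_ptr[i];
            error += diff * diff;
        }
        return error;
    }
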
diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
new file mode 100644
index 0000000..68db815
--- /dev/null
+++ b/libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vp8/encoder/block.h"
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+ short *coeff_ptr = mb->block[0].coeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+ short *s_ptr = &mb->coeff[256];
+ short *d_ptr = &mb->e_mbd.dqcoeff[256];
+ return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
+ short *diff, unsigned char *predictor,
+ int pitch);
+void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+{
+ unsigned char *z = *(be->base_src) + be->src;
+ unsigned int src_stride = be->src_stride;
+ short *diff = &be->src_diff[0];
+ unsigned char *predictor = &bd->predictor[0];
+ vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
+}
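
As with the MMX stubs, a scalar sketch of the macroblock error the xmm routine accelerates; in this sketch the inner loop starts at dc, so passing dc == 1 excludes each block's DC coefficient from the sum (an illustrative reference, not the shipped C code):

    static int mbblock_error_c(const short *coeff, const short *dqcoeff, int dc)
    {
        int i, j, error = 0;
        for (i = 0; i < 16; i++)        /* 16 luma 4x4 blocks */
        {
            for (j = dc; j < 16; j++)   /* dc == 1 skips index 0 */
            {
                const int d = coeff[i * 16 + j] - dqcoeff[i * 16 + j];
                error += d * d;
            }
        }
        return error;
    }
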
diff --git a/libvpx/vp8/exports_dec b/libvpx/vp8/exports_dec
new file mode 100644
index 0000000..100ac5c
--- /dev/null
+++ b/libvpx/vp8/exports_dec
@@ -0,0 +1,2 @@
+data vpx_codec_vp8_dx_algo
+text vpx_codec_vp8_dx
diff --git a/libvpx/vp8/exports_enc b/libvpx/vp8/exports_enc
new file mode 100644
index 0000000..29ff35e
--- /dev/null
+++ b/libvpx/vp8/exports_enc
@@ -0,0 +1,2 @@
+data vpx_codec_vp8_cx_algo
+text vpx_codec_vp8_cx
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
new file mode 100644
index 0000000..a328f46
--- /dev/null
+++ b/libvpx/vp8/vp8_common.mk
@@ -0,0 +1,193 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
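+# Build-system note: sources accumulate onto VP8_COMMON_SRCS-yes. When a
+# feature is enabled at configure time its flag (e.g. HAVE_MMX) expands to
+# "yes", so a group whose flag is unset lands on a variable the build
+# never reads.
+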
+VP8_COMMON_SRCS-yes += vp8_common.mk
+VP8_COMMON_SRCS-yes += common/pragmas.h
+VP8_COMMON_SRCS-yes += common/ppflags.h
+VP8_COMMON_SRCS-yes += common/onyx.h
+VP8_COMMON_SRCS-yes += common/onyxd.h
+VP8_COMMON_SRCS-yes += common/alloccommon.c
+VP8_COMMON_SRCS-yes += common/asm_com_offsets.c
+VP8_COMMON_SRCS-yes += common/blockd.c
+VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP8_COMMON_SRCS-yes += common/debugmodes.c
+VP8_COMMON_SRCS-yes += common/default_coef_probs.h
+VP8_COMMON_SRCS-yes += common/dequantize.c
+VP8_COMMON_SRCS-yes += common/entropy.c
+VP8_COMMON_SRCS-yes += common/entropymode.c
+VP8_COMMON_SRCS-yes += common/entropymv.c
+VP8_COMMON_SRCS-yes += common/extend.c
+VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
+VP8_COMMON_SRCS-yes += common/findnearmv.c
+VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
+VP8_COMMON_SRCS-yes += common/idct_blk.c
+VP8_COMMON_SRCS-yes += common/idctllm.c
+VP8_COMMON_SRCS-yes += common/alloccommon.h
+VP8_COMMON_SRCS-yes += common/blockd.h
+VP8_COMMON_SRCS-yes += common/common.h
+VP8_COMMON_SRCS-yes += common/entropy.h
+VP8_COMMON_SRCS-yes += common/entropymode.h
+VP8_COMMON_SRCS-yes += common/entropymv.h
+VP8_COMMON_SRCS-yes += common/extend.h
+VP8_COMMON_SRCS-yes += common/findnearmv.h
+VP8_COMMON_SRCS-yes += common/header.h
+VP8_COMMON_SRCS-yes += common/invtrans.h
+VP8_COMMON_SRCS-yes += common/loopfilter.h
+VP8_COMMON_SRCS-yes += common/modecont.h
+VP8_COMMON_SRCS-yes += common/mv.h
+VP8_COMMON_SRCS-yes += common/onyxc_int.h
+VP8_COMMON_SRCS-yes += common/quant_common.h
+VP8_COMMON_SRCS-yes += common/reconinter.h
+VP8_COMMON_SRCS-yes += common/reconintra4x4.h
+VP8_COMMON_SRCS-yes += common/rtcd.c
+VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP8_COMMON_SRCS-yes += common/setupintrarecon.h
+VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
+VP8_COMMON_SRCS-yes += common/systemdependent.h
+VP8_COMMON_SRCS-yes += common/threading.h
+VP8_COMMON_SRCS-yes += common/treecoder.h
+VP8_COMMON_SRCS-yes += common/loopfilter.c
+VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
+VP8_COMMON_SRCS-yes += common/mbpitch.c
+VP8_COMMON_SRCS-yes += common/modecont.c
+VP8_COMMON_SRCS-yes += common/quant_common.c
+VP8_COMMON_SRCS-yes += common/reconinter.c
+VP8_COMMON_SRCS-yes += common/reconintra.c
+VP8_COMMON_SRCS-yes += common/reconintra4x4.c
+VP8_COMMON_SRCS-yes += common/sad_c.c
+VP8_COMMON_SRCS-yes += common/setupintrarecon.c
+VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
+VP8_COMMON_SRCS-yes += common/variance_c.c
+VP8_COMMON_SRCS-yes += common/variance.h
+VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
+
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
+VP8_COMMON_SRCS-yes += common/treecoder.c
+
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
+VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
+
+ifeq ($(CONFIG_POSTPROC),yes)
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
+VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
+endif
+
+ifeq ($(ARCH_X86_64),yes)
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_block_sse2.asm
+endif
+
+# common (mips dspr2)
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idctllm_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/filter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/loopfilter_filters_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/reconinter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c
+
+# common (arm c)
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/reconintra_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
+VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c
+
+# common (media)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.h
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/bilinearfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x4_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x8_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem16x16_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/iwalsh_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/filter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/loopfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/simpleloopfilter_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
+VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
+
+# common (neon)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict4x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict4x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x4_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/save_reg_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM)
+VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
new file mode 100644
index 0000000..eeac3a8
--- /dev/null
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -0,0 +1,1317 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_rtcd.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vp8cx.h"
+#include "vp8/encoder/firstpass.h"
+#include "vp8/common/onyx.h"
+#include <stdlib.h>
+#include <string.h>
+
+struct vp8_extracfg
+{
+ struct vpx_codec_pkt_list *pkt_list;
+ int cpu_used; /* available cpu percentage in 1/16 */
+ unsigned int enable_auto_alt_ref; /* if the encoder decides to use an alternate reference frame */
+ unsigned int noise_sensitivity;
+ unsigned int Sharpness;
+ unsigned int static_thresh;
+ unsigned int token_partitions;
+ unsigned int arnr_max_frames; /* alt_ref Noise Reduction Max Frame Count */
+ unsigned int arnr_strength; /* alt_ref Noise Reduction Strength */
+ unsigned int arnr_type; /* alt_ref filter type */
+ vp8e_tuning tuning;
+ unsigned int cq_level; /* constrained quality level */
+ unsigned int rc_max_intra_bitrate_pct;
+
+};
+
+struct extraconfig_map
+{
+ int usage;
+ struct vp8_extracfg cfg;
+};
+
+static const struct extraconfig_map extracfg_map[] =
+{
+ {
+ 0,
+ {
+ NULL,
+#if !(CONFIG_REALTIME_ONLY)
+ 0, /* cpu_used */
+#else
+ 4, /* cpu_used */
+#endif
+ 0, /* enable_auto_alt_ref */
+ 0, /* noise_sensitivity */
+ 0, /* Sharpness */
+ 0, /* static_thresh */
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ VP8_EIGHT_TOKENPARTITION,
+#else
+ VP8_ONE_TOKENPARTITION, /* token_partitions */
+#endif
+ 0, /* arnr_max_frames */
+ 3, /* arnr_strength */
+ 3, /* arnr_type*/
+ 0, /* tuning*/
+ 10, /* cq_level */
+ 0, /* rc_max_intra_bitrate_pct */
+ }
+ }
+};
+
+struct vpx_codec_alg_priv
+{
+ vpx_codec_priv_t base;
+ vpx_codec_enc_cfg_t cfg;
+ struct vp8_extracfg vp8_cfg;
+ VP8_CONFIG oxcf;
+ struct VP8_COMP *cpi;
+ unsigned char *cx_data;
+ unsigned int cx_data_sz;
+ vpx_image_t preview_img;
+ unsigned int next_frame_flag;
+ vp8_postproc_cfg_t preview_ppcfg;
+ /* pkt_list size depends on the maximum number of lagged frames allowed. */
+ vpx_codec_pkt_list_decl(64) pkt_list;
+ unsigned int fixed_kf_cntr;
+};
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error)
+{
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+
+#undef ERROR
+#define ERROR(str) do {\
+ ctx->base.err_detail = str;\
+ return VPX_CODEC_INVALID_PARAM;\
+ } while(0)
+
+#define RANGE_CHECK(p,memb,lo,hi) do {\
+ if(!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range ["#lo".."#hi"]");\
+ } while(0)
+
+#define RANGE_CHECK_HI(p,memb,hi) do {\
+ if(!((p)->memb <= (hi))) \
+ ERROR(#memb " out of range [.."#hi"]");\
+ } while(0)
+
+#define RANGE_CHECK_LO(p,memb,lo) do {\
+ if(!((p)->memb >= (lo))) \
+ ERROR(#memb " out of range ["#lo"..]");\
+ } while(0)
+
+#define RANGE_CHECK_BOOL(p,memb) do {\
+ if(!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\
+ } while(0)
+
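+/* Illustrative expansion: RANGE_CHECK_HI(cfg, rc_max_quantizer, 63) sets
+ * err_detail to "rc_max_quantizer out of range [..63]" and returns
+ * VPX_CODEC_INVALID_PARAM whenever cfg->rc_max_quantizer > 63.
+ */
+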
+static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vp8_extracfg *vp8_cfg,
+ int finalize)
+{
+ RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */
+ RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, 3);
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
+#if CONFIG_REALTIME_ONLY
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
+#elif CONFIG_MULTI_RES_ENCODING
+ if (ctx->base.enc.total_encoders > 1)
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 0);
+#else
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, 25);
+#endif
+ RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
+
+/* TODO: add spatial re-sampling support and frame dropping in
+ * multi-res-encoder.*/
+#if CONFIG_MULTI_RES_ENCODING
+ if (ctx->base.enc.total_encoders > 1)
+ RANGE_CHECK_HI(cfg, rc_resize_allowed, 0);
+#else
+ RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
+#endif
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
+ RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
+
+#if CONFIG_REALTIME_ONLY
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#elif CONFIG_MULTI_RES_ENCODING
+ if (ctx->base.enc.total_encoders > 1)
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#else
+ RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+#endif
+
+ /* VP8 does not support a lower bound on the keyframe interval in
+ * automatic keyframe placement mode.
+ */
+ if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist
+ && cfg->kf_min_dist > 0)
+ ERROR("kf_min_dist not supported in auto mode, use 0 "
+ "or kf_max_dist instead.");
+
+ RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref);
+ RANGE_CHECK(vp8_cfg, cpu_used, -16, 16);
+
+#if CONFIG_REALTIME_ONLY && !CONFIG_TEMPORAL_DENOISING
+ RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 0);
+#else
+ RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6);
+#endif
+
+ RANGE_CHECK(vp8_cfg, token_partitions, VP8_ONE_TOKENPARTITION,
+ VP8_EIGHT_TOKENPARTITION);
+ RANGE_CHECK_HI(vp8_cfg, Sharpness, 7);
+ RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
+ RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
+ RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
+ RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+ if(finalize && cfg->rc_end_usage == VPX_CQ)
+ RANGE_CHECK(vp8_cfg, cq_level,
+ cfg->rc_min_quantizer, cfg->rc_max_quantizer);
+
+#if !(CONFIG_REALTIME_ONLY)
+ if (cfg->g_pass == VPX_RC_LAST_PASS)
+ {
+ size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ int n_packets = (int)(cfg->rc_twopass_stats_in.sz /
+ packet_sz);
+ FIRSTPASS_STATS *stats;
+
+ if (!cfg->rc_twopass_stats_in.buf)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ stats = (void*)((char *)cfg->rc_twopass_stats_in.buf
+ + (n_packets - 1) * packet_sz);
+
+ if ((int)(stats->count + 0.5) != n_packets - 1)
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+ }
+#endif
+
+ RANGE_CHECK(cfg, ts_number_layers, 1, 5);
+
+ if (cfg->ts_number_layers > 1)
+ {
+ unsigned int i;
+ RANGE_CHECK_HI(cfg, ts_periodicity, 16);
+
+ for (i=1; i<cfg->ts_number_layers; i++)
+ if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1])
+ ERROR("ts_target_bitrate entries are not strictly increasing");
+
+ RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
+ for (i=cfg->ts_number_layers-2; i>0; i--)
+ if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i])
+ ERROR("ts_rate_decimator factors are not powers of 2");
+
+ /* every per-frame layer id must reference a defined layer */
+ for (i=0; i<cfg->ts_periodicity; i++)
+ RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1);
+ }
+
+#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
+ if(cfg->g_threads > (1 << vp8_cfg->token_partitions))
+ ERROR("g_threads cannot be bigger than number of token partitions");
+#endif
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img)
+{
+ switch (img->fmt)
+ {
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_VPXI420:
+ case VPX_IMG_FMT_VPXYV12:
+ break;
+ default:
+ ERROR("Invalid image format. Only YV12 and I420 images are supported");
+ }
+
+ if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h))
+ ERROR("Image size must match encoder init configuration size");
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
+ vpx_codec_enc_cfg_t cfg,
+ struct vp8_extracfg vp8_cfg,
+ vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
+{
+ oxcf->multi_threaded = cfg.g_threads;
+ oxcf->Version = cfg.g_profile;
+
+ oxcf->Width = cfg.g_w;
+ oxcf->Height = cfg.g_h;
+ oxcf->timebase = cfg.g_timebase;
+
+ oxcf->error_resilient_mode = cfg.g_error_resilient;
+
+ switch (cfg.g_pass)
+ {
+ case VPX_RC_ONE_PASS:
+ oxcf->Mode = MODE_BESTQUALITY;
+ break;
+ case VPX_RC_FIRST_PASS:
+ oxcf->Mode = MODE_FIRSTPASS;
+ break;
+ case VPX_RC_LAST_PASS:
+ oxcf->Mode = MODE_SECONDPASS_BEST;
+ break;
+ }
+
+ if (cfg.g_pass == VPX_RC_FIRST_PASS || cfg.g_pass == VPX_RC_ONE_PASS)
+ {
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
+ }
+ else
+ {
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ }
+
+ oxcf->allow_df = (cfg.rc_dropframe_thresh > 0);
+ oxcf->drop_frames_water_mark = cfg.rc_dropframe_thresh;
+
+ oxcf->allow_spatial_resampling = cfg.rc_resize_allowed;
+ oxcf->resample_up_water_mark = cfg.rc_resize_up_thresh;
+ oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh;
+
+ if (cfg.rc_end_usage == VPX_VBR)
+ {
+ oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+ }
+ else if (cfg.rc_end_usage == VPX_CBR)
+ {
+ oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+ }
+ else if (cfg.rc_end_usage == VPX_CQ)
+ {
+ oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+ }
+
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
+
+ oxcf->best_allowed_q = cfg.rc_min_quantizer;
+ oxcf->worst_allowed_q = cfg.rc_max_quantizer;
+ oxcf->cq_level = vp8_cfg.cq_level;
+ oxcf->fixed_q = -1;
+
+ oxcf->under_shoot_pct = cfg.rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg.rc_overshoot_pct;
+
+ oxcf->maximum_buffer_size_in_ms = cfg.rc_buf_sz;
+ oxcf->starting_buffer_level_in_ms = cfg.rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_in_ms = cfg.rc_buf_optimal_sz;
+
+ oxcf->maximum_buffer_size = cfg.rc_buf_sz;
+ oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
+ oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
+
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
+ oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
+
+ oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO
+ && cfg.kf_min_dist != cfg.kf_max_dist;
+ oxcf->key_freq = cfg.kf_max_dist;
+
+ oxcf->number_of_layers = cfg.ts_number_layers;
+ oxcf->periodicity = cfg.ts_periodicity;
+
+ if (oxcf->number_of_layers > 1)
+ {
+ memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
+ sizeof(cfg.ts_target_bitrate));
+ memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
+ sizeof(cfg.ts_rate_decimator));
+ memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
+ }
+
+#if CONFIG_MULTI_RES_ENCODING
+ /* When mr_cfg is NULL, oxcf->mr_total_resolutions and oxcf->mr_encoder_id
+ * are both memset to 0, which ensures the correct logic under this
+ * situation.
+ */
+ if(mr_cfg)
+ {
+ oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions;
+ oxcf->mr_encoder_id = mr_cfg->mr_encoder_id;
+ oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num;
+ oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
+ oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info;
+ }
+#endif
+
+ oxcf->cpu_used = vp8_cfg.cpu_used;
+ oxcf->encode_breakout = vp8_cfg.static_thresh;
+ oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
+ oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
+ oxcf->token_partitions = vp8_cfg.token_partitions;
+
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
+
+ oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
+
+ oxcf->tuning = vp8_cfg.tuning;
+
+ /*
+ printf("Current VP8 Settings: \n");
+ printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
+ printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);
+ printf("Sharpness: %d\n", oxcf->Sharpness);
+ printf("cpu_used: %d\n", oxcf->cpu_used);
+ printf("Mode: %d\n", oxcf->Mode);
+ printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file);
+ printf("auto_key: %d\n", oxcf->auto_key);
+ printf("key_freq: %d\n", oxcf->key_freq);
+ printf("end_usage: %d\n", oxcf->end_usage);
+ printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);
+ printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);
+ printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);
+ printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level);
+ printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);
+ printf("fixed_q: %d\n", oxcf->fixed_q);
+ printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);
+ printf("best_allowed_q: %d\n", oxcf->best_allowed_q);
+ printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling);
+ printf("resample_down_water_mark: %d\n", oxcf->resample_down_water_mark);
+ printf("resample_up_water_mark: %d\n", oxcf->resample_up_water_mark);
+ printf("allow_df: %d\n", oxcf->allow_df);
+ printf("drop_frames_water_mark: %d\n", oxcf->drop_frames_water_mark);
+ printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias);
+ printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
+ printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
+ printf("allow_lag: %d\n", oxcf->allow_lag);
+ printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
+ printf("play_alternate: %d\n", oxcf->play_alternate);
+ printf("Version: %d\n", oxcf->Version);
+ printf("multi_threaded: %d\n", oxcf->multi_threaded);
+ printf("encode_breakout: %d\n", oxcf->encode_breakout);
+ */
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
+ const vpx_codec_enc_cfg_t *cfg)
+{
+ vpx_codec_err_t res;
+
+ if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
+ && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
+ ERROR("Cannot change width or height after initialization");
+
+ /* Prevent increasing lag_in_frames. This check is stricter than it needs
+ * to be -- the limit is not increasing past the first lag_in_frames
+ * value, but we don't track the initial config, only the last successful
+ * config.
+ */
+ if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames))
+ ERROR("Cannot increase lag_in_frames");
+
+ res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
+
+ if (!res)
+ {
+ ctx->cfg = *cfg;
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+ vp8_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ return res;
+}
+
+
+int vp8_reverse_trans(int);
+
+
+static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ void *arg = va_arg(args, void *);
+
+#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break
+
+ if (!arg)
+ return VPX_CODEC_INVALID_PARAM;
+
+ switch (ctrl_id)
+ {
+ MAP(VP8E_GET_LAST_QUANTIZER, vp8_get_quantizer(ctx->cpi));
+ MAP(VP8E_GET_LAST_QUANTIZER_64, vp8_reverse_trans(vp8_get_quantizer(ctx->cpi)));
+ }
+
+ return VPX_CODEC_OK;
+#undef MAP
+}
+
+
+static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ struct vp8_extracfg xcfg = ctx->vp8_cfg;
+
+#define MAP(id, var) case id: var = CAST(id, args); break;
+
+ switch (ctrl_id)
+ {
+ MAP(VP8E_SET_CPUUSED, xcfg.cpu_used);
+ MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref);
+ MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity);
+ MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness);
+ MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh);
+ MAP(VP8E_SET_TOKEN_PARTITIONS, xcfg.token_partitions);
+
+ MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames);
+ MAP(VP8E_SET_ARNR_STRENGTH , xcfg.arnr_strength);
+ MAP(VP8E_SET_ARNR_TYPE , xcfg.arnr_type);
+ MAP(VP8E_SET_TUNING, xcfg.tuning);
+ MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
+ MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
+
+ }
+
+ res = validate_config(ctx, &ctx->cfg, &xcfg, 0);
+
+ if (!res)
+ {
+ ctx->vp8_cfg = xcfg;
+ set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+ vp8_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ return res;
+#undef MAP
+}
+
+static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
+ void **mem_loc)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+#if CONFIG_MULTI_RES_ENCODING
+ LOWER_RES_FRAME_INFO *shared_mem_loc;
+ int mb_rows = ((cfg->g_h + 15) >>4);
+ int mb_cols = ((cfg->g_w + 15) >>4);
+
+ shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO));
+ if(!shared_mem_loc)
+ {
+ /* return before the mb_info allocation below dereferences it */
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ shared_mem_loc->mb_info = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_MB_INFO));
+ if(!(shared_mem_loc->mb_info))
+ {
+ res = VPX_CODEC_MEM_ERROR;
+ }
+ else
+ {
+ *mem_loc = (void *)shared_mem_loc;
+ res = VPX_CODEC_OK;
+ }
+#endif
+ return res;
+}
+
+static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *mr_cfg)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ struct vpx_codec_alg_priv *priv;
+ vpx_codec_enc_cfg_t *cfg;
+ unsigned int i;
+
+ struct VP8_COMP *optr;
+
+ vpx_rtcd();
+
+ if (!ctx->priv)
+ {
+ priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
+
+ if (!priv)
+ {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ ctx->priv = &priv->base;
+ ctx->priv->sz = sizeof(*ctx->priv);
+ ctx->priv->iface = ctx->iface;
+ ctx->priv->alg_priv = priv;
+ ctx->priv->init_flags = ctx->init_flags;
+
+ if (ctx->config.enc)
+ {
+ /* Update the reference to the config structure to an
+ * internal copy.
+ */
+ ctx->priv->alg_priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &ctx->priv->alg_priv->cfg;
+ }
+
+ cfg = &ctx->priv->alg_priv->cfg;
+
+ /* Select the extra vp8 configuration table based on the current
+ * usage value. If the current usage value isn't found, use the
+ * values for usage case 0.
+ */
+ for (i = 0;
+ extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
+ i++);
+
+ priv->vp8_cfg = extracfg_map[i].cfg;
+ priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+
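+ /* Budget the compressed-data buffer at twice one uncompressed I420
+ * frame (w * h * 3/2 bytes), floored at 32k just below. */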
+ priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+
+ if (priv->cx_data_sz < 32768) priv->cx_data_sz = 32768;
+
+ priv->cx_data = malloc(priv->cx_data_sz);
+
+ if (!priv->cx_data)
+ {
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ if(mr_cfg)
+ ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions;
+ else
+ ctx->priv->enc.total_encoders = 1;
+
+ res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
+
+ if (!res)
+ {
+ set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+ ctx->priv->alg_priv->cfg,
+ ctx->priv->alg_priv->vp8_cfg,
+ mr_cfg);
+
+ optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
+
+ if (!optr)
+ res = VPX_CODEC_MEM_ERROR;
+ else
+ ctx->priv->alg_priv->cpi = optr;
+ }
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx)
+{
+#if CONFIG_MULTI_RES_ENCODING
+ /* Free multi-encoder shared memory */
+ if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1))
+ {
+ LOWER_RES_FRAME_INFO *shared_mem_loc = (LOWER_RES_FRAME_INFO *)ctx->oxcf.mr_low_res_mode_info;
+ free(shared_mem_loc->mb_info);
+ free(ctx->oxcf.mr_low_res_mode_info);
+ }
+#endif
+
+ free(ctx->cx_data);
+ vp8_remove_compressor(&ctx->cpi);
+ free(ctx);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+ yv12->uv_width = (1 + yv12->y_width) / 2;
+ yv12->uv_height = (1 + yv12->y_height) / 2;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+ yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
+ return res;
+}
+
+static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
+ unsigned long duration,
+ unsigned long deadline)
+{
+ unsigned int new_qc;
+
+#if !(CONFIG_REALTIME_ONLY)
+ /* Use best quality mode if no deadline is given. */
+ new_qc = MODE_BESTQUALITY;
+
+ if (deadline)
+ {
+ uint64_t duration_us;
+
+ /* Convert duration parameter from stream timebase to microseconds */
+ duration_us = (uint64_t)duration * 1000000
+ * (uint64_t)ctx->cfg.g_timebase.num
+ / (uint64_t)ctx->cfg.g_timebase.den;
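+ /* e.g. duration == 1 with g_timebase {1, 30} gives a duration_us of
+ * 33333, so a per-frame deadline above ~33ms selects good quality
+ * and a tighter one falls through to realtime below. */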
+
+ /* If the deadline is more than the duration this frame is to be shown for,
+ * use good quality mode. Otherwise use realtime mode.
+ */
+ new_qc = (deadline > duration_us) ? MODE_GOODQUALITY : MODE_REALTIME;
+ }
+
+#else
+ new_qc = MODE_REALTIME;
+#endif
+
+ if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS)
+ new_qc = MODE_FIRSTPASS;
+ else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS)
+ new_qc = (new_qc == MODE_BESTQUALITY)
+ ? MODE_SECONDPASS_BEST
+ : MODE_SECONDPASS;
+
+ if (ctx->oxcf.Mode != new_qc)
+ {
+ ctx->oxcf.Mode = new_qc;
+ vp8_change_config(ctx->cpi, &ctx->oxcf);
+ }
+}
+
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned long duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if (!ctx->cfg.rc_target_bitrate)
+ return res;
+
+ if (img)
+ res = validate_img(ctx, img);
+
+ if (!res)
+ res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
+ pick_quickcompress_mode(ctx, duration, deadline);
+ vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+ /* Handle Flags */
+ if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
+ || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF)))
+ {
+ ctx->base.err_detail = "Conflicting flags.";
+ return VPX_CODEC_INVALID_PARAM;
+ }
+
+ if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF
+ | VP8_EFLAG_NO_REF_ARF))
+ {
+ int ref = 7;
+
+ if (flags & VP8_EFLAG_NO_REF_LAST)
+ ref ^= VP8_LAST_FRAME;
+
+ if (flags & VP8_EFLAG_NO_REF_GF)
+ ref ^= VP8_GOLD_FRAME;
+
+ if (flags & VP8_EFLAG_NO_REF_ARF)
+ ref ^= VP8_ALTR_FRAME;
+
+ vp8_use_as_reference(ctx->cpi, ref);
+ }
+
+ if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF
+ | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF
+ | VP8_EFLAG_FORCE_ARF))
+ {
+ int upd = 7;
+
+ if (flags & VP8_EFLAG_NO_UPD_LAST)
+ upd ^= VP8_LAST_FRAME;
+
+ if (flags & VP8_EFLAG_NO_UPD_GF)
+ upd ^= VP8_GOLD_FRAME;
+
+ if (flags & VP8_EFLAG_NO_UPD_ARF)
+ upd ^= VP8_ALTR_FRAME;
+
+ vp8_update_reference(ctx->cpi, upd);
+ }
+
+ if (flags & VP8_EFLAG_NO_UPD_ENTROPY)
+ {
+ vp8_update_entropy(ctx->cpi, 0);
+ }
+
+ /* Handle fixed keyframe intervals */
+ if (ctx->cfg.kf_mode == VPX_KF_AUTO
+ && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist)
+ {
+ if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist)
+ {
+ flags |= VPX_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+
+ /* Initialize the encoder instance on the first frame*/
+ if (!res && ctx->cpi)
+ {
+ unsigned int lib_flags;
+ YV12_BUFFER_CONFIG sd;
+ int64_t dst_time_stamp, dst_end_time_stamp;
+ unsigned long size, cx_data_sz;
+ unsigned char *cx_data;
+ unsigned char *cx_data_end;
+ int comp_data_state = 0;
+
+ /* Set up internal flags */
+ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR)
+ ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1;
+
+ if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION)
+ ((VP8_COMP *)ctx->cpi)->output_partition = 1;
+
+ /* Convert API flags to internal codec lib flags */
+ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ /* vp8 uses a 10,000,000 ticks/second clock for its timestamps */
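+ /* e.g. with g_timebase {1, 30}, a pts of 30 (one second) becomes
+ * 30 * 10000000 * 1 / 30 = 10000000 ticks. */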
+ dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+ dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den;
+
+ if (img != NULL)
+ {
+ res = image2yuvconfig(img, &sd);
+
+ if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
+ &sd, dst_time_stamp, dst_end_time_stamp))
+ {
+ VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+ res = update_error_state(ctx, &cpi->common.error);
+ }
+
+ /* reset for next frame */
+ ctx->next_frame_flag = 0;
+ }
+
+ cx_data = ctx->cx_data;
+ cx_data_sz = ctx->cx_data_sz;
+ cx_data_end = ctx->cx_data + cx_data_sz;
+ lib_flags = 0;
+
+ while (cx_data_sz >= ctx->cx_data_sz / 2)
+ {
+ comp_data_state = vp8_get_compressed_data(ctx->cpi,
+ &lib_flags,
+ &size,
+ cx_data,
+ cx_data_end,
+ &dst_time_stamp,
+ &dst_end_time_stamp,
+ !img);
+
+ if(comp_data_state == VPX_CODEC_CORRUPT_FRAME)
+ return VPX_CODEC_CORRUPT_FRAME;
+ else if(comp_data_state == -1)
+ break;
+
+ if (size)
+ {
+ vpx_codec_pts_t round, delta;
+ vpx_codec_cx_pkt_t pkt;
+ VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+
+ /* Add the frame packet to the list of returned packets. */
+ round = (vpx_codec_pts_t)1000000
+ * ctx->cfg.g_timebase.num / 2 - 1;
+ delta = (dst_end_time_stamp - dst_time_stamp);
+ pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+ pkt.data.frame.pts =
+ (dst_time_stamp * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000;
+ pkt.data.frame.duration = (unsigned long)
+ ((delta * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000);
+ pkt.data.frame.flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY)
+ pkt.data.frame.flags |= VPX_FRAME_IS_KEY;
+
+ if (!cpi->common.show_frame)
+ {
+ pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
+
+ /* This timestamp should be as close as possible to the
+ * prior PTS so that if a decoder uses pts to schedule when
+ * to decode this frame, it starts right after the last
+ * visible frame. Invisible frames have no duration.
+ */
+ pkt.data.frame.pts = ((cpi->last_time_stamp_seen
+ * ctx->cfg.g_timebase.den + round)
+ / ctx->cfg.g_timebase.num / 10000000) + 1;
+ pkt.data.frame.duration = 0;
+ }
+
+ if (cpi->droppable)
+ pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE;
+
+ if (cpi->output_partition)
+ {
+ int i;
+ const int num_partitions =
+ (1 << cpi->common.multi_token_partition) + 1;
+
+ pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT;
+
+ for (i = 0; i < num_partitions; ++i)
+ {
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ pkt.data.frame.buf = cpi->partition_d[i];
+#else
+ pkt.data.frame.buf = cx_data;
+ cx_data += cpi->partition_sz[i];
+ cx_data_sz -= cpi->partition_sz[i];
+#endif
+ pkt.data.frame.sz = cpi->partition_sz[i];
+ pkt.data.frame.partition_id = i;
+ /* don't set the fragment bit for the last partition */
+ if (i == (num_partitions - 1))
+ pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT;
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ }
+#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
+ /* In lagged mode the encoder can buffer multiple frames.
+ * We don't want this in partitioned output because
+ * partitions are spread all over the output buffer.
+ * So, force an exit!
+ */
+ cx_data_sz -= ctx->cx_data_sz / 2;
+#endif
+ }
+ else
+ {
+ pkt.data.frame.buf = cx_data;
+ pkt.data.frame.sz = size;
+ pkt.data.frame.partition_id = -1;
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ cx_data += size;
+ cx_data_sz -= size;
+ }
+ }
+ }
+ }
+
+ return res;
+}
+
+
+static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter)
+{
+ return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
+
+static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ vp8_set_reference(ctx->cpi, frame->frame_type, &sd);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ vp8_get_reference(ctx->cpi, frame->frame_type, &sd);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+ (void)ctr_id;
+
+ if (data)
+ {
+ ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+#else
+ (void)ctx;
+ (void)ctr_id;
+ (void)args;
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+
+static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx)
+{
+
+ YV12_BUFFER_CONFIG sd;
+ vp8_ppflags_t flags = {0};
+
+ if (ctx->preview_ppcfg.post_proc_flag)
+ {
+ flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag;
+ flags.deblocking_level = ctx->preview_ppcfg.deblocking_level;
+ flags.noise_level = ctx->preview_ppcfg.noise_level;
+ }
+
+ if (0 == vp8_get_preview_raw_frame(ctx->cpi, &sd, &flags))
+ {
+
+ /*
+ vpx_img_wrap(&ctx->preview_img, VPX_IMG_FMT_YV12,
+ sd.y_width + 2*VP8BORDERINPIXELS,
+ sd.y_height + 2*VP8BORDERINPIXELS,
+ 1,
+ sd.buffer_alloc);
+ vpx_img_set_rect(&ctx->preview_img,
+ VP8BORDERINPIXELS, VP8BORDERINPIXELS,
+ sd.y_width, sd.y_height);
+ */
+
+ ctx->preview_img.bps = 12;
+ ctx->preview_img.planes[VPX_PLANE_Y] = sd.y_buffer;
+ ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer;
+ ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer;
+
+ if (sd.clrtype == REG_YUV)
+ ctx->preview_img.fmt = VPX_IMG_FMT_I420;
+ else
+ ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420;
+
+ ctx->preview_img.x_chroma_shift = 1;
+ ctx->preview_img.y_chroma_shift = 1;
+
+ ctx->preview_img.d_w = sd.y_width;
+ ctx->preview_img.d_h = sd.y_height;
+ ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
+ ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
+ ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;
+ ctx->preview_img.w = sd.y_width;
+ ctx->preview_img.h = sd.y_height;
+
+ return &ctx->preview_img;
+ }
+ else
+ return NULL;
+}
+
+static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ int update = va_arg(args, int);
+ vp8_update_entropy(ctx->cpi, update);
+ return VPX_CODEC_OK;
+
+}
+
+static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ int update = va_arg(args, int);
+ vp8_update_reference(ctx->cpi, update);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ int reference_flag = va_arg(args, int);
+ vp8_use_as_reference(ctx->cpi, reference_flag);
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
+
+ if (data)
+ {
+ vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+
+ if (!vp8_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols, roi->delta_q, roi->delta_lf, roi->static_threshold))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+ vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
+
+ if (data)
+ {
+
+ vpx_active_map_t *map = (vpx_active_map_t *)data;
+
+ if (!vp8_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *);
+
+ if (data)
+ {
+ int res;
+ vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
+ res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, scalemode.v_scaling_mode);
+
+ if (!res)
+ {
+ /* force the next frame to be a key frame so the new scaling mode takes effect */
+ ctx->next_frame_flag |= FRAMEFLAGS_KEY;
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+
+static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
+{
+ {VP8_SET_REFERENCE, vp8e_set_reference},
+ {VP8_COPY_REFERENCE, vp8e_get_reference},
+ {VP8_SET_POSTPROC, vp8e_set_previewpp},
+ {VP8E_UPD_ENTROPY, vp8e_update_entropy},
+ {VP8E_UPD_REFERENCE, vp8e_update_reference},
+ {VP8E_USE_REFERENCE, vp8e_use_reference},
+ {VP8E_SET_ROI_MAP, vp8e_set_roi_map},
+ {VP8E_SET_ACTIVEMAP, vp8e_set_activemap},
+ {VP8E_SET_SCALEMODE, vp8e_set_scalemode},
+ {VP8E_SET_CPUUSED, set_param},
+ {VP8E_SET_NOISE_SENSITIVITY, set_param},
+ {VP8E_SET_ENABLEAUTOALTREF, set_param},
+ {VP8E_SET_SHARPNESS, set_param},
+ {VP8E_SET_STATIC_THRESHOLD, set_param},
+ {VP8E_SET_TOKEN_PARTITIONS, set_param},
+ {VP8E_GET_LAST_QUANTIZER, get_param},
+ {VP8E_GET_LAST_QUANTIZER_64, get_param},
+ {VP8E_SET_ARNR_MAXFRAMES, set_param},
+ {VP8E_SET_ARNR_STRENGTH , set_param},
+ {VP8E_SET_ARNR_TYPE , set_param},
+ {VP8E_SET_TUNING, set_param},
+ {VP8E_SET_CQ_LEVEL, set_param},
+ {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
+ { -1, NULL},
+};
+
+static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
+{
+ {
+ 0,
+ {
+ 0, /* g_usage */
+ 0, /* g_threads */
+ 0, /* g_profile */
+
+ 320, /* g_width */
+ 240, /* g_height */
+ {1, 30}, /* g_timebase */
+
+ 0, /* g_error_resilient */
+
+ VPX_RC_ONE_PASS, /* g_pass */
+
+ 0, /* g_lag_in_frames */
+
+ 0, /* rc_dropframe_thresh */
+ 0, /* rc_resize_allowed */
+ 60, /* rc_resize_down_thresh */
+ 30, /* rc_resize_up_thresh */
+
+ VPX_VBR, /* rc_end_usage */
+#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
+ {0}, /* rc_twopass_stats_in */
+#endif
+ 256, /* rc_target_bitrate */
+ 4, /* rc_min_quantizer */
+ 63, /* rc_max_quantizer */
+ 100, /* rc_undershoot_pct */
+ 100, /* rc_overshoot_pct */
+
+ 6000, /* rc_buf_sz */
+ 4000, /* rc_buf_initial_sz */
+ 5000, /* rc_buf_optimal_sz */
+
+ 50, /* rc_2pass_vbr_bias_pct */
+ 0, /* rc_2pass_vbr_minsection_pct */
+ 400, /* rc_2pass_vbr_maxsection_pct */
+
+ /* keyframing settings (kf) */
+ VPX_KF_AUTO, /* kf_mode */
+ 0, /* kf_min_dist */
+ 128, /* kf_max_dist */
+
+#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
+ 1, /* g_delete_first_pass_file */
+ "vp8.fpf" /* first pass filename */
+#endif
+
+ 1, /* ts_number_layers */
+ {0}, /* ts_target_bitrate */
+ {0}, /* ts_rate_decimator */
+ 0, /* ts_periodicity */
+ {0}, /* ts_layer_id */
+ }},
+ { -1, {NOT_IMPLEMENTED}}
+};
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_cx) =
+{
+ "WebM Project VP8 Encoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR |
+ VPX_CODEC_CAP_OUTPUT_PARTITION,
+ /* vpx_codec_caps_t caps; */
+ vp8e_init, /* vpx_codec_init_fn_t init; */
+ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */
+ NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */
+ NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ {
+ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t *cfg_maps; */
+ vp8e_encode, /* vpx_codec_encode_fn_t encode; */
+ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */
+ vp8e_set_config,
+ NOT_IMPLEMENTED,
+ vp8e_get_preview,
+ vp8e_mr_alloc_mem,
+ } /* encoder functions */
+};
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
new file mode 100644
index 0000000..c13d697
--- /dev/null
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -0,0 +1,902 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "vpx_rtcd.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vp8dx.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx_version.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
+#include "common/alloccommon.h"
+#include "vpx_mem/vpx_mem.h"
+#if CONFIG_ERROR_CONCEALMENT
+#include "decoder/error_concealment.h"
+#endif
+#include "decoder/decoderthreading.h"
+
+#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
+#define VP8_CAP_ERROR_CONCEALMENT (CONFIG_ERROR_CONCEALMENT ? \
+ VPX_CODEC_CAP_ERROR_CONCEALMENT : 0)
+
+typedef vpx_codec_stream_info_t vp8_stream_info_t;
+
+/* Structures for handling memory allocations */
+typedef enum
+{
+ VP8_SEG_ALG_PRIV = 256,
+ VP8_SEG_MAX
+} mem_seg_id_t;
+#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0])))
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t);
+
+typedef struct
+{
+ unsigned int id;
+ unsigned long sz;
+ unsigned int align;
+ unsigned int flags;
+ unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t);
+} mem_req_t;
+
+static const mem_req_t vp8_mem_req_segs[] =
+{
+ {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz},
+ {VP8_SEG_MAX, 0, 0, 0, NULL}
+};
+
+struct vpx_codec_alg_priv
+{
+ vpx_codec_priv_t base;
+ vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1];
+ vpx_codec_dec_cfg_t cfg;
+ vp8_stream_info_t si;
+ int defer_alloc;
+ int decoder_init;
+ struct VP8D_COMP *pbi;
+ int postproc_cfg_set;
+ vp8_postproc_cfg_t postproc_cfg;
+#if CONFIG_POSTPROC_VISUALIZER
+ unsigned int dbg_postproc_flag;
+ int dbg_color_ref_frame_flag;
+ int dbg_color_mb_modes_flag;
+ int dbg_color_b_modes_flag;
+ int dbg_display_mv_flag;
+#endif
+ vpx_image_t img;
+ int img_setup;
+ void *user_priv;
+};
+
+static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t flags)
+{
+ /* Although this declaration is constant, we can't use it in the requested
+ * segments list because we want to define the requested segments list
+ * before defining the private type (so that the number of memory maps is
+ * known)
+ */
+ (void)si;
+ return sizeof(vpx_codec_alg_priv_t);
+}
+
+
+static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap)
+{
+ free(mmap->priv);
+}
+
+static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap)
+{
+ vpx_codec_err_t res;
+ unsigned int align;
+
+ align = mmap->align ? mmap->align - 1 : 0;
+
+ if (mmap->flags & VPX_CODEC_MEM_ZERO)
+ mmap->priv = calloc(1, mmap->sz + align);
+ else
+ mmap->priv = malloc(mmap->sz + align);
+
+ res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR;
+ mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align);
+ mmap->dtor = vp8_mmap_dtor;
+ return res;
+}
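The helper above over-allocates by align bytes and rounds the returned pointer up, stashing the raw pointer in mmap->priv so the destructor can free it. A minimal standalone sketch of the same rounding arithmetic, assuming a power-of-two alignment (the names here are illustrative, not from the patch):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Round ptr up to the next align-byte boundary; align must be a
     * power of two so that (align - 1) is a contiguous bit mask. */
    static void *align_up(void *ptr, uintptr_t align)
    {
        uintptr_t mask = align - 1;
        return (void *)(((uintptr_t)ptr + mask) & ~mask);
    }

    int main(void)
    {
        const uintptr_t align = 8;
        void *raw = malloc(100 + align - 1); /* slack for the round-up */
        void *base = align_up(raw, align);

        assert(((uintptr_t)base & (align - 1)) == 0);
        free(raw); /* free the raw pointer, as vp8_mmap_dtor does */
        return 0;
    }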
+
+static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si,
+ const vpx_codec_mmap_t *mmaps,
+ vpx_codec_flags_t init_flags)
+{
+ int i;
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++)
+ {
+ /* Ensure the segment has been allocated */
+ if (!mmaps[i].base)
+ {
+ res = VPX_CODEC_MEM_ERROR;
+ break;
+ }
+
+ /* Verify variable size segment is big enough for the current si. */
+ if (vp8_mem_req_segs[i].calc_sz)
+ {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = si->w;
+ cfg.h = si->h;
+
+ if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags))
+ {
+ res = VPX_CODEC_MEM_ERROR;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap)
+{
+ int i;
+
+ ctx->priv = mmap->base;
+ ctx->priv->sz = sizeof(*ctx->priv);
+ ctx->priv->iface = ctx->iface;
+ ctx->priv->alg_priv = mmap->base;
+
+ for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+ ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id;
+
+ ctx->priv->alg_priv->mmaps[0] = *mmap;
+ ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
+ ctx->priv->init_flags = ctx->init_flags;
+
+ if (ctx->config.dec)
+ {
+ /* Update the reference to the config structure to an internal copy. */
+ ctx->priv->alg_priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &ctx->priv->alg_priv->cfg;
+ }
+}
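vp8_init_ctx stores the same allocation in both ctx->priv (as a vpx_codec_priv_t) and ctx->priv->alg_priv (as the full vpx_codec_alg_priv_t). That aliasing is only sound because base is the first member of the private struct. A toy model of the idiom, using hypothetical stand-in types rather than the library's:

    #include <assert.h>
    #include <stdlib.h>

    struct pub  { int sz; };                 /* stands in for vpx_codec_priv_t */
    struct priv { struct pub base; int x; }; /* stands in for vpx_codec_alg_priv_t */

    int main(void)
    {
        struct priv *p = calloc(1, sizeof(*p));
        struct pub  *q;

        if (!p)
            return 1;

        q = (struct pub *)p; /* legal: base is the first member */
        assert((void *)q == (void *)p);
        q->sz = sizeof(*q); /* mirrors ctx->priv->sz = sizeof(*ctx->priv) */
        p->x  = 42;
        free(p);
        return 0;
    }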
+
+static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id)
+{
+ int i;
+
+ for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
+ if (ctx->mmaps[i].id == id)
+ return ctx->mmaps[i].base;
+
+ return NULL;
+}
+
+static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
+{
+ /* nothing to clean up */
+}
+
+static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
+ vpx_codec_priv_enc_mr_cfg_t *data)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ (void) data;
+
+ vpx_rtcd();
+
+ /* This function only allocates space for the vpx_codec_alg_priv_t
+ * structure. More memory may be required at the time the stream
+ * information becomes known.
+ */
+ if (!ctx->priv)
+ {
+ vpx_codec_mmap_t mmap;
+
+ mmap.id = vp8_mem_req_segs[0].id;
+ mmap.sz = sizeof(vpx_codec_alg_priv_t);
+ mmap.align = vp8_mem_req_segs[0].align;
+ mmap.flags = vp8_mem_req_segs[0].flags;
+
+ res = vp8_mmap_alloc(&mmap);
+
+ if (!res)
+ {
+ vp8_init_ctx(ctx, &mmap);
+
+ ctx->priv->alg_priv->defer_alloc = 1;
+            /* post-processing level initialized to do nothing */
+ }
+ }
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx)
+{
+ int i;
+
+ vp8dx_remove_decompressor(ctx->pbi);
+
+ for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--)
+ {
+ if (ctx->mmaps[i].dtor)
+ ctx->mmaps[i].dtor(&ctx->mmaps[i]);
+ }
+
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8_peek_si(const uint8_t *data,
+ unsigned int data_sz,
+ vpx_codec_stream_info_t *si)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+    if (data + data_sz <= data)
+ res = VPX_CODEC_INVALID_PARAM;
+ else
+ {
+        /* Parse the uncompressed part of the key frame header.
+         * 3 bytes: version, frame type and an offset
+         * 3 bytes: sync code (0x9d, 0x01, 0x2a)
+         * 4 bytes: image width and height in the lowest 14 bits of
+         *          each 2-byte value.
+         */
+ si->is_kf = 0;
+
+ if (data_sz >= 10 && !(data[0] & 0x01)) /* I-Frame */
+ {
+ const uint8_t *c = data + 3;
+ si->is_kf = 1;
+
+ /* vet via sync code */
+ if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+
+ si->w = (c[3] | (c[4] << 8)) & 0x3fff;
+ si->h = (c[5] | (c[6] << 8)) & 0x3fff;
+
+ /*printf("w=%d, h=%d\n", si->w, si->h);*/
+ if (!(si->h | si->w))
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+ else
+ res = VPX_CODEC_UNSUP_BITSTREAM;
+ }
+
+ return res;
+
+}
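To make the header layout concrete, the sketch below hand-builds a 10-byte key frame header and applies the same extraction as vp8_peek_si; the width and height bytes are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint8_t hdr[10] = {
            0x00, 0x00, 0x00, /* frame tag: bit 0 of byte 0 clear => key frame */
            0x9d, 0x01, 0x2a, /* sync code */
            0x80, 0x02,       /* width  = 0x0280 = 640 (lowest 14 bits) */
            0xe0, 0x01        /* height = 0x01e0 = 480 (lowest 14 bits) */
        };
        const uint8_t *c = hdr + 3;
        unsigned int w = (c[3] | (c[4] << 8)) & 0x3fff;
        unsigned int h = (c[5] | (c[6] << 8)) & 0x3fff;

        printf("key frame: %d  sync ok: %d  %ux%u\n",
               !(hdr[0] & 0x01),
               c[0] == 0x9d && c[1] == 0x01 && c[2] == 0x2a,
               w, h);
        return 0;
    }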
+
+static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_stream_info_t *si)
+{
+
+ unsigned int sz;
+
+ if (si->sz >= sizeof(vp8_stream_info_t))
+ sz = sizeof(vp8_stream_info_t);
+ else
+ sz = sizeof(vpx_codec_stream_info_t);
+
+ memcpy(si, &ctx->si, sz);
+ si->sz = sz;
+
+ return VPX_CODEC_OK;
+}
+
+
+static vpx_codec_err_t
+update_error_state(vpx_codec_alg_priv_t *ctx,
+ const struct vpx_internal_error_info *error)
+{
+ vpx_codec_err_t res;
+
+ if ((res = error->error_code))
+ ctx->base.err_detail = error->has_detail
+ ? error->detail
+ : NULL;
+
+ return res;
+}
+
+static void yuvconfig2image(vpx_image_t *img,
+ const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv)
+{
+    /* vpx_img_wrap() doesn't allow specifying independent strides for
+     * the Y, U, and V planes, nor other alignment adjustments that
+     * might be representable by a YV12_BUFFER_CONFIG, so we just
+     * initialize all the fields.
+     */
+ img->fmt = yv12->clrtype == REG_YUV ?
+ VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;
+ img->w = yv12->y_stride;
+ img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15;
+ img->d_w = yv12->y_width;
+ img->d_h = yv12->y_height;
+ img->x_chroma_shift = 1;
+ img->y_chroma_shift = 1;
+ img->planes[VPX_PLANE_Y] = yv12->y_buffer;
+ img->planes[VPX_PLANE_U] = yv12->u_buffer;
+ img->planes[VPX_PLANE_V] = yv12->v_buffer;
+ img->planes[VPX_PLANE_ALPHA] = NULL;
+ img->stride[VPX_PLANE_Y] = yv12->y_stride;
+ img->stride[VPX_PLANE_U] = yv12->uv_stride;
+ img->stride[VPX_PLANE_V] = yv12->uv_stride;
+ img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+ img->bps = 12;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+}
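Because yuvconfig2image reports x_chroma_shift and y_chroma_shift of 1, the wrapped image is 4:2:0: each chroma sample covers a 2x2 block of luma pixels. A small sketch of how a consumer would address the U sample for luma position (x, y), assuming a filled-in vpx_image_t (this helper is editorial, not part of the patch):

    #include "vpx/vpx_image.h"

    /* Return a pointer to the U sample covering luma pixel (x, y). */
    static unsigned char *u_sample(const vpx_image_t *img, int x, int y)
    {
        return img->planes[VPX_PLANE_U]
               + (y >> img->y_chroma_shift) * img->stride[VPX_PLANE_U]
               + (x >> img->x_chroma_shift);
    }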
+
+static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ unsigned int data_sz,
+ void *user_priv,
+ long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ unsigned int resolution_change = 0;
+ unsigned int w, h;
+
+ /* Determine the stream parameters. Note that we rely on peek_si to
+ * validate that we have a buffer that does not wrap around the top
+ * of the heap.
+ */
+ w = ctx->si.w;
+ h = ctx->si.h;
+
+ res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+
+    if ((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf)
+    {
+        /* The peek function reports an error for non-key frames; at this
+         * call site that is expected rather than an error. */
+        res = VPX_CODEC_OK;
+    }
+
+    if (!ctx->decoder_init && !ctx->si.is_kf)
+        res = VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((ctx->si.h != h) || (ctx->si.w != w))
+ resolution_change = 1;
+
+ /* Perform deferred allocations, if required */
+ if (!res && ctx->defer_alloc)
+ {
+ int i;
+
+ for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++)
+ {
+ vpx_codec_dec_cfg_t cfg;
+
+ cfg.w = ctx->si.w;
+ cfg.h = ctx->si.h;
+ ctx->mmaps[i].id = vp8_mem_req_segs[i].id;
+ ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz;
+ ctx->mmaps[i].align = vp8_mem_req_segs[i].align;
+ ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags;
+
+ if (!ctx->mmaps[i].sz)
+ ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg,
+ ctx->base.init_flags);
+
+ res = vp8_mmap_alloc(&ctx->mmaps[i]);
+ }
+
+ if (!res)
+ vp8_finalize_mmaps(ctx);
+
+ ctx->defer_alloc = 0;
+ }
+
+ /* Initialize the decoder instance on the first frame*/
+ if (!res && !ctx->decoder_init)
+ {
+ res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags);
+
+ if (!res)
+ {
+ VP8D_CONFIG oxcf;
+ struct VP8D_COMP* optr;
+
+ oxcf.Width = ctx->si.w;
+ oxcf.Height = ctx->si.h;
+ oxcf.Version = 9;
+ oxcf.postprocess = 0;
+ oxcf.max_threads = ctx->cfg.threads;
+ oxcf.error_concealment =
+ (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT);
+ oxcf.input_fragments =
+ (ctx->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS);
+
+ optr = vp8dx_create_decompressor(&oxcf);
+
+ /* If postprocessing was enabled by the application and a
+ * configuration has not been provided, default it.
+ */
+ if (!ctx->postproc_cfg_set
+ && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
+ {
+ ctx->postproc_cfg.post_proc_flag =
+ VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE;
+ ctx->postproc_cfg.deblocking_level = 4;
+ ctx->postproc_cfg.noise_level = 0;
+ }
+
+ if (!optr)
+ res = VPX_CODEC_ERROR;
+ else
+ ctx->pbi = optr;
+ }
+
+ ctx->decoder_init = 1;
+ }
+
+ if (!res && ctx->pbi)
+ {
+        if (resolution_change)
+ {
+ VP8D_COMP *pbi = ctx->pbi;
+ VP8_COMMON *const pc = & pbi->common;
+ MACROBLOCKD *const xd = & pbi->mb;
+#if CONFIG_MULTITHREAD
+ int i;
+#endif
+ pc->Width = ctx->si.w;
+ pc->Height = ctx->si.h;
+ {
+ int prev_mb_rows = pc->mb_rows;
+
+ if (setjmp(pbi->common.error.jmp))
+ {
+ pbi->common.error.setjmp = 0;
+ /* same return value as used in vp8dx_receive_compressed_data */
+ return -1;
+ }
+
+ pbi->common.error.setjmp = 1;
+
+ if (pc->Width <= 0)
+ {
+ pc->Width = w;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame width");
+ }
+
+ if (pc->Height <= 0)
+ {
+ pc->Height = h;
+ vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid frame height");
+ }
+
+ if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffers");
+
+ xd->pre = pc->yv12_fb[pc->lst_fb_idx];
+ xd->dst = pc->yv12_fb[pc->new_fb_idx];
+
+#if CONFIG_MULTITHREAD
+ for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
+ {
+ pbi->mb_row_di[i].mbd.dst = pc->yv12_fb[pc->new_fb_idx];
+ vp8_build_block_doffsets(&pbi->mb_row_di[i].mbd);
+ }
+#endif
+ vp8_build_block_doffsets(&pbi->mb);
+
+ /* allocate memory for last frame MODE_INFO array */
+#if CONFIG_ERROR_CONCEALMENT
+
+ if (pbi->ec_enabled)
+ {
+ /* old prev_mip was released by vp8_de_alloc_frame_buffers()
+ * called in vp8_alloc_frame_buffers() */
+ pc->prev_mip = vpx_calloc(
+ (pc->mb_cols + 1) * (pc->mb_rows + 1),
+ sizeof(MODE_INFO));
+
+ if (!pc->prev_mip)
+ {
+ vp8_de_alloc_frame_buffers(pc);
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+                                       "Failed to allocate "
+                                       "last frame MODE_INFO array");
+ }
+
+ pc->prev_mi = pc->prev_mip + pc->mode_info_stride + 1;
+
+ if (vp8_alloc_overlap_lists(pbi))
+ vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate overlap lists "
+ "for error concealment");
+ }
+
+#endif
+
+#if CONFIG_MULTITHREAD
+ if (pbi->b_multithreaded_rd)
+ vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+#else
+ (void)prev_mb_rows;
+#endif
+ }
+
+ pbi->common.error.setjmp = 0;
+
+ /* required to get past the first get_free_fb() call */
+ ctx->pbi->common.fb_idx_ref_cnt[0] = 0;
+ }
+
+ ctx->user_priv = user_priv;
+ if (vp8dx_receive_compressed_data(ctx->pbi, data_sz, data, deadline))
+ {
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+ res = update_error_state(ctx, &pbi->common.error);
+ }
+ }
+
+ return res;
+}
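The resolution-change path arms pbi->common.error.jmp before calling the allocators; vpx_internal_error() records the message and longjmps back, which is why the block can simply return -1 on failure. A toy model of that convention, with stand-in names rather than the library's:

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf err_jmp;

    /* Stands in for vpx_internal_error(): record the error, then unwind. */
    static void fail(int code, const char *msg)
    {
        fprintf(stderr, "error %d: %s\n", code, msg);
        longjmp(err_jmp, code);
    }

    static void risky_alloc(int w, int h)
    {
        if (w <= 0 || h <= 0)
            fail(1, "invalid frame size");
        /* ... allocate frame buffers ... */
    }

    int main(void)
    {
        if (setjmp(err_jmp))
            return -1; /* same idea as the early return above */

        risky_alloc(640, 480);
        return 0;
    }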
+
+static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx,
+ vpx_codec_iter_t *iter)
+{
+ vpx_image_t *img = NULL;
+
+ /* iter acts as a flip flop, so an image is only returned on the first
+ * call to get_frame.
+ */
+ if (!(*iter))
+ {
+ YV12_BUFFER_CONFIG sd;
+ int64_t time_stamp = 0, time_end_stamp = 0;
+ vp8_ppflags_t flags = {0};
+
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ {
+            flags.post_proc_flag = ctx->postproc_cfg.post_proc_flag
+#if CONFIG_POSTPROC_VISUALIZER
+
+ | ((ctx->dbg_color_ref_frame_flag != 0) ? VP8D_DEBUG_CLR_FRM_REF_BLKS : 0)
+ | ((ctx->dbg_color_mb_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_color_b_modes_flag != 0) ? VP8D_DEBUG_CLR_BLK_MODES : 0)
+ | ((ctx->dbg_display_mv_flag != 0) ? VP8D_DEBUG_DRAW_MV : 0)
+#endif
+ ;
+ flags.deblocking_level = ctx->postproc_cfg.deblocking_level;
+ flags.noise_level = ctx->postproc_cfg.noise_level;
+#if CONFIG_POSTPROC_VISUALIZER
+            flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag;
+ flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag;
+ flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag;
+ flags.display_mv_flag = ctx->dbg_display_mv_flag;
+#endif
+ }
+
+ if (0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags))
+ {
+ yuvconfig2image(&ctx->img, &sd, ctx->user_priv);
+
+ img = &ctx->img;
+ *iter = img;
+ }
+ }
+
+ return img;
+}
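vp8_decode and vp8_get_frame back the public vpx_codec_decode() and vpx_codec_get_frame() entry points. A hedged usage sketch; acquiring the compressed data is elided, and buf/buf_sz are assumed to hold exactly one frame:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void decode_one(const unsigned char *buf, unsigned int buf_sz)
    {
        vpx_codec_ctx_t codec;
        vpx_codec_iter_t iter = NULL;
        vpx_image_t *img;

        if (vpx_codec_dec_init(&codec, vpx_codec_vp8_dx(), NULL, 0))
            return;

        /* vp8_decode runs here; a non-key first frame fails with
         * VPX_CODEC_UNSUP_BITSTREAM, as shown above. */
        if (!vpx_codec_decode(&codec, buf, buf_sz, NULL, 0))
        {
            /* The iterator flip-flops, so VP8 yields at most one
             * image per decode call. */
            while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
                printf("decoded %ux%u frame\n", img->d_w, img->d_h);
        }

        vpx_codec_destroy(&codec);
    }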
+
+
+static
+vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx,
+ vpx_codec_mmap_t *mmap,
+ vpx_codec_iter_t *iter)
+{
+ vpx_codec_err_t res;
+ const mem_req_t *seg_iter = *iter;
+
+ /* Get address of next segment request */
+ do
+ {
+ if (!seg_iter)
+ seg_iter = vp8_mem_req_segs;
+ else if (seg_iter->id != VP8_SEG_MAX)
+ seg_iter++;
+
+ *iter = (vpx_codec_iter_t)seg_iter;
+
+ if (seg_iter->id != VP8_SEG_MAX)
+ {
+ mmap->id = seg_iter->id;
+ mmap->sz = seg_iter->sz;
+ mmap->align = seg_iter->align;
+ mmap->flags = seg_iter->flags;
+
+ if (!seg_iter->sz)
+ mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags);
+
+ res = VPX_CODEC_OK;
+ }
+ else
+ res = VPX_CODEC_LIST_END;
+ }
+ while (!mmap->sz && res != VPX_CODEC_LIST_END);
+
+ return res;
+}
+
+static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx,
+ const vpx_codec_mmap_t *mmap)
+{
+ vpx_codec_err_t res = VPX_CODEC_MEM_ERROR;
+ int i, done;
+
+    if (!ctx->priv)
+    {
+        if (mmap->id == VP8_SEG_ALG_PRIV)
+        {
+            vp8_init_ctx(ctx, mmap);
+            res = VPX_CODEC_OK;
+        }
+    }
+
+ done = 1;
+
+ if (!res && ctx->priv->alg_priv)
+ {
+ for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
+ {
+ if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
+ if (!ctx->priv->alg_priv->mmaps[i].base)
+ {
+ ctx->priv->alg_priv->mmaps[i] = *mmap;
+ res = VPX_CODEC_OK;
+ }
+
+ done &= (ctx->priv->alg_priv->mmaps[i].base != NULL);
+ }
+ }
+
+ if (done && !res)
+ {
+ vp8_finalize_mmaps(ctx->priv->alg_priv);
+ res = ctx->iface->init(ctx, NULL);
+ }
+
+ return res;
+}
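Together these implement the (since-deprecated) external memory allocation handshake: the application pulls segment requests one at a time, satisfies each, and hands them back; once every segment has a base pointer, vp8_xma_set_mmap calls the codec's init function. A sketch, under the assumption that the vpx_codec_get_mem_map()/vpx_codec_set_mem_map() wrappers of this libvpx vintage are available:

    #include <stdlib.h>
    #include "vpx/vpx_decoder.h"

    static vpx_codec_err_t alloc_segments(vpx_codec_ctx_t *ctx)
    {
        vpx_codec_iter_t iter = NULL;
        vpx_codec_mmap_t mmap;
        vpx_codec_err_t res;

        while ((res = vpx_codec_get_mem_map(ctx, &mmap, &iter)) == VPX_CODEC_OK)
        {
            /* calloc satisfies the 8-byte alignment requested here; a
             * stricter segment would need explicit alignment handling. */
            mmap.base = calloc(1, mmap.sz);
            mmap.dtor = NULL;
            mmap.priv = NULL;

            if (!mmap.base)
                return VPX_CODEC_MEM_ERROR;

            res = vpx_codec_set_mem_map(ctx, &mmap, 1);
            if (res != VPX_CODEC_OK)
                return res;
        }

        return res == VPX_CODEC_LIST_END ? VPX_CODEC_OK : res;
    }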
+
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+ YV12_BUFFER_CONFIG *yv12)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+ yv12->y_buffer = img->planes[VPX_PLANE_Y];
+ yv12->u_buffer = img->planes[VPX_PLANE_U];
+ yv12->v_buffer = img->planes[VPX_PLANE_V];
+
+ yv12->y_width = img->d_w;
+ yv12->y_height = img->d_h;
+ yv12->uv_width = yv12->y_width / 2;
+ yv12->uv_height = yv12->y_height / 2;
+
+ yv12->y_stride = img->stride[VPX_PLANE_Y];
+ yv12->uv_stride = img->stride[VPX_PLANE_U];
+
+ yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
+ yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
+
+ return res;
+}
+
+
+static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp8dx_set_reference(ctx->pbi, frame->frame_type, &sd);
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+
+ vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+
+ if (data)
+ {
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+
+ return vp8dx_get_reference(ctx->pbi, frame->frame_type, &sd);
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args)
+{
+#if CONFIG_POSTPROC
+ vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *);
+
+ if (data)
+ {
+ ctx->postproc_cfg_set = 1;
+ ctx->postproc_cfg = *((vp8_postproc_cfg_t *)data);
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+#else
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC
+ int data = va_arg(args, int);
+
+#define MAP(id, var) case id: var = data; break;
+
+ switch (ctrl_id)
+ {
+ MAP (VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag);
+ MAP (VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag);
+ MAP (VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag);
+ MAP (VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag);
+ }
+
+ return VPX_CODEC_OK;
+#else
+ return VPX_CODEC_INCAPABLE;
+#endif
+}
+
+static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ int *update_info = va_arg(args, int *);
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+
+ if (update_info)
+ {
+ *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME
+ + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME
+ + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame );
+static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+ int *ref_info = va_arg(args, int *);
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+ VP8_COMMON *oci = &pbi->common;
+
+ if (ref_info)
+ {
+ *ref_info =
+ (vp8dx_references_buffer( oci, ALTREF_FRAME )?VP8_ALTR_FRAME:0) |
+ (vp8dx_references_buffer( oci, GOLDEN_FRAME )?VP8_GOLD_FRAME:0) |
+ (vp8dx_references_buffer( oci, LAST_FRAME )?VP8_LAST_FRAME:0);
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args)
+{
+
+ int *corrupted = va_arg(args, int *);
+
+ if (corrupted)
+ {
+ VP8D_COMP *pbi = (VP8D_COMP *)ctx->pbi;
+ *corrupted = pbi->common.frame_to_show->corrupted;
+
+ return VPX_CODEC_OK;
+ }
+ else
+ return VPX_CODEC_INVALID_PARAM;
+
+}
+
+vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] =
+{
+ {VP8_SET_REFERENCE, vp8_set_reference},
+ {VP8_COPY_REFERENCE, vp8_get_reference},
+ {VP8_SET_POSTPROC, vp8_set_postproc},
+ {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options},
+ {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options},
+ {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates},
+ {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted},
+ {VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame},
+ { -1, NULL},
+};
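Each row binds a control ID to its handler; the generic vpx_codec_control() entry point dispatches through this table. A hedged sketch driving two of the controls, assuming codec was initialized with VPX_CODEC_USE_POSTPROC and has already decoded a frame:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void tune_and_query(vpx_codec_ctx_t *codec)
    {
        vp8_postproc_cfg_t pp = { VP8_DEBLOCK | VP8_DEMACROBLOCK, 4, 0 };
        int corrupted = 0;

        /* Dispatches to vp8_set_postproc via vp8_ctf_maps. */
        if (vpx_codec_control(codec, VP8_SET_POSTPROC, &pp))
            printf("postproc not available\n");

        /* Dispatches to vp8_get_frame_corrupted. */
        if (!vpx_codec_control(codec, VP8D_GET_FRAME_CORRUPTED, &corrupted))
            printf("last frame corrupted: %d\n", corrupted);
    }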
+
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(vpx_codec_vp8_dx) =
+{
+ "WebM Project VP8 Decoder" VERSION_STRING,
+ VPX_CODEC_INTERNAL_ABI_VERSION,
+ VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC | VP8_CAP_ERROR_CONCEALMENT |
+ VPX_CODEC_CAP_INPUT_FRAGMENTS,
+ /* vpx_codec_caps_t caps; */
+ vp8_init, /* vpx_codec_init_fn_t init; */
+ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */
+ vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */
+ vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */
+ vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */
+ {
+ vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */
+ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
+ vp8_decode, /* vpx_codec_decode_fn_t decode; */
+ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ },
+ { /* encoder functions */
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED,
+ NOT_IMPLEMENTED
+ }
+};
diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk
new file mode 100644
index 0000000..0ae2f10
--- /dev/null
+++ b/libvpx/vp8/vp8cx.mk
@@ -0,0 +1,117 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
+
+VP8_CX_EXPORTS += exports_enc
+
+VP8_CX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
+VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no)
+VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
+VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
+
+ifeq ($(ARCH_ARM),yes)
+ include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
+endif
+
+VP8_CX_SRCS-yes += vp8cx.mk
+
+VP8_CX_SRCS-yes += vp8_cx_iface.c
+
+VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
+VP8_CX_SRCS-yes += encoder/bitstream.c
+VP8_CX_SRCS-yes += encoder/boolhuff.c
+VP8_CX_SRCS-yes += encoder/dct.c
+VP8_CX_SRCS-yes += encoder/encodeframe.c
+VP8_CX_SRCS-yes += encoder/encodeframe.h
+VP8_CX_SRCS-yes += encoder/encodeintra.c
+VP8_CX_SRCS-yes += encoder/encodemb.c
+VP8_CX_SRCS-yes += encoder/encodemv.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-yes += encoder/firstpass.c
+VP8_CX_SRCS-yes += encoder/block.h
+VP8_CX_SRCS-yes += encoder/boolhuff.h
+VP8_CX_SRCS-yes += encoder/bitstream.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.h
+VP8_CX_SRCS-$(CONFIG_TEMPORAL_DENOISING) += encoder/denoising.c
+VP8_CX_SRCS-yes += encoder/encodeintra.h
+VP8_CX_SRCS-yes += encoder/encodemb.h
+VP8_CX_SRCS-yes += encoder/encodemv.h
+VP8_CX_SRCS-yes += encoder/firstpass.h
+VP8_CX_SRCS-yes += encoder/lookahead.c
+VP8_CX_SRCS-yes += encoder/lookahead.h
+VP8_CX_SRCS-yes += encoder/mcomp.h
+VP8_CX_SRCS-yes += encoder/modecosts.h
+VP8_CX_SRCS-yes += encoder/onyx_int.h
+VP8_CX_SRCS-yes += encoder/pickinter.h
+VP8_CX_SRCS-yes += encoder/psnr.h
+VP8_CX_SRCS-yes += encoder/quantize.h
+VP8_CX_SRCS-yes += encoder/ratectrl.h
+VP8_CX_SRCS-yes += encoder/rdopt.h
+VP8_CX_SRCS-yes += encoder/tokenize.h
+VP8_CX_SRCS-yes += encoder/treewriter.h
+VP8_CX_SRCS-yes += encoder/mcomp.c
+VP8_CX_SRCS-yes += encoder/modecosts.c
+VP8_CX_SRCS-yes += encoder/onyx_if.c
+VP8_CX_SRCS-yes += encoder/pickinter.c
+VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/psnr.c
+VP8_CX_SRCS-yes += encoder/quantize.c
+VP8_CX_SRCS-yes += encoder/ratectrl.c
+VP8_CX_SRCS-yes += encoder/rdopt.c
+VP8_CX_SRCS-yes += encoder/segmentation.c
+VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
+VP8_CX_SRCS-yes += encoder/tokenize.c
+VP8_CX_SRCS-yes += encoder/dct_value_cost.h
+VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
+VP8_CX_SRCS-yes += encoder/treewriter.c
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
+VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
+VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+endif
+
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
+VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
+
+ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
+ifeq ($(HAVE_SSE2),yes)
+vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
+endif
+endif
+
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+endif
+
+
+VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/libvpx/vp8/vp8cx_arm.mk b/libvpx/vp8/vp8cx_arm.mk
new file mode 100644
index 0000000..b030ee5
--- /dev/null
+++ b/libvpx/vp8/vp8cx_arm.mk
@@ -0,0 +1,44 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk
+
+#File list for arm
+# encoder
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
+VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
+
+#File list for edsp
+# encoder
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/boolhuff_arm.c
+VP8_CX_SRCS_REMOVE-$(HAVE_EDSP) += encoder/boolhuff.c
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
+VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
+
+#File list for media
+# encoder
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
+
+#File list for neon
+# encoder
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_memcpy_neon$(ASM)
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
diff --git a/libvpx/vp8/vp8dx.mk b/libvpx/vp8/vp8dx.mk
new file mode 100644
index 0000000..dd39190
--- /dev/null
+++ b/libvpx/vp8/vp8dx.mk
@@ -0,0 +1,66 @@
+##
+## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+
+include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
+
+VP8_DX_EXPORTS += exports_dec
+
+VP8_DX_SRCS-yes += $(VP8_COMMON_SRCS-yes)
+VP8_DX_SRCS-no += $(VP8_COMMON_SRCS-no)
+VP8_DX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
+VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
+
+VP8_DX_SRCS-yes += vp8dx.mk
+
+VP8_DX_SRCS-yes += vp8_dx_iface.c
+
+VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
+VP8_DX_SRCS-yes += decoder/dboolhuff.c
+VP8_DX_SRCS-yes += decoder/decodemv.c
+VP8_DX_SRCS-yes += decoder/decodframe.c
+VP8_DX_SRCS-yes += decoder/detokenize.c
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h
+VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.c
+VP8_DX_SRCS-yes += decoder/dboolhuff.h
+VP8_DX_SRCS-yes += decoder/decodemv.h
+VP8_DX_SRCS-yes += decoder/decoderthreading.h
+VP8_DX_SRCS-yes += decoder/detokenize.h
+VP8_DX_SRCS-yes += decoder/onyxd_int.h
+VP8_DX_SRCS-yes += decoder/treereader.h
+VP8_DX_SRCS-yes += decoder/onyxd_if.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
+
+VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes))