summaryrefslogtreecommitdiffstats
path: root/libvpx/vp8/common/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vp8/common/x86')
-rw-r--r--libvpx/vp8/common/x86/dequantize_mmx.asm258
-rw-r--r--libvpx/vp8/common/x86/filter_x86.c35
-rw-r--r--libvpx/vp8/common/x86/filter_x86.h19
-rw-r--r--libvpx/vp8/common/x86/idct_blk_mmx.c127
-rw-r--r--libvpx/vp8/common/x86/idct_blk_sse2.c89
-rw-r--r--libvpx/vp8/common/x86/idctllm_mmx.asm295
-rw-r--r--libvpx/vp8/common/x86/idctllm_sse2.asm708
-rw-r--r--libvpx/vp8/common/x86/iwalsh_mmx.asm140
-rw-r--r--libvpx/vp8/common/x86/iwalsh_sse2.asm121
-rw-r--r--libvpx/vp8/common/x86/loopfilter_block_sse2.asm813
-rw-r--r--libvpx/vp8/common/x86/loopfilter_mmx.asm1753
-rw-r--r--libvpx/vp8/common/x86/loopfilter_sse2.asm1640
-rw-r--r--libvpx/vp8/common/x86/loopfilter_x86.c198
-rw-r--r--libvpx/vp8/common/x86/mfqe_sse2.asm281
-rw-r--r--libvpx/vp8/common/x86/postproc_mmx.asm314
-rw-r--r--libvpx/vp8/common/x86/postproc_sse2.asm721
-rw-r--r--libvpx/vp8/common/x86/postproc_x86.c24
-rw-r--r--libvpx/vp8/common/x86/recon_mmx.asm274
-rw-r--r--libvpx/vp8/common/x86/recon_sse2.asm1080
-rw-r--r--libvpx/vp8/common/x86/recon_wrapper_sse2.c186
-rw-r--r--libvpx/vp8/common/x86/sad_mmx.asm427
-rw-r--r--libvpx/vp8/common/x86/sad_sse2.asm410
-rw-r--r--libvpx/vp8/common/x86/sad_sse3.asm960
-rw-r--r--libvpx/vp8/common/x86/sad_sse4.asm353
-rw-r--r--libvpx/vp8/common/x86/sad_ssse3.asm370
-rw-r--r--libvpx/vp8/common/x86/subpixel_mmx.asm702
-rw-r--r--libvpx/vp8/common/x86/subpixel_sse2.asm1372
-rw-r--r--libvpx/vp8/common/x86/subpixel_ssse3.asm1507
-rw-r--r--libvpx/vp8/common/x86/variance_impl_mmx.asm851
-rw-r--r--libvpx/vp8/common/x86/variance_impl_sse2.asm1359
-rw-r--r--libvpx/vp8/common/x86/variance_impl_ssse3.asm364
-rw-r--r--libvpx/vp8/common/x86/variance_mmx.c398
-rw-r--r--libvpx/vp8/common/x86/variance_sse2.c558
-rw-r--r--libvpx/vp8/common/x86/variance_ssse3.c166
-rw-r--r--libvpx/vp8/common/x86/vp8_asm_stubs.c629
35 files changed, 19502 insertions, 0 deletions
diff --git a/libvpx/vp8/common/x86/dequantize_mmx.asm b/libvpx/vp8/common/x86/dequantize_mmx.asm
new file mode 100644
index 0000000..4e551f0
--- /dev/null
+++ b/libvpx/vp8/common/x86/dequantize_mmx.asm
@@ -0,0 +1,258 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) -- dq[i] = sq[i] * q[i] for i = 0..15
+global sym(vp8_dequantize_b_impl_mmx) PRIVATE
+sym(vp8_dequantize_b_impl_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;sq (source coefficients, read only)
+ mov rdi, arg(1) ;dq (destination, 16 words written)
+ mov rax, arg(2) ;q (per-coefficient multipliers, read only)
+
+ movq mm1, [rsi]
+ pmullw mm1, [rax+0] ; mm1 = sq[0..3] * q[0..3] (low 16 bits of each product)
+ movq [rdi], mm1
+
+ movq mm1, [rsi+8]
+ pmullw mm1, [rax+8] ; mm1 = sq[4..7] * q[4..7]
+ movq [rdi+8], mm1
+
+ movq mm1, [rsi+16]
+ pmullw mm1, [rax+16] ; mm1 = sq[8..11] * q[8..11]
+ movq [rdi+16], mm1
+
+ movq mm1, [rsi+24]
+ pmullw mm1, [rax+24] ; mm1 = sq[12..15] * q[12..15]
+ movq [rdi+24], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_dequant_idct_add_mmx(
+;short *input, 0 -- 16 coefficients; multiplied by dq, then zeroed in place
+;short *dq, 1 -- 16 multipliers, one per coefficient
+;unsigned char *dest, 2 -- 4x4 pixels; read, residual added, written back
+;int stride) 3 -- byte step between dest rows
+global sym(vp8_dequant_idct_add_mmx) PRIVATE
+sym(vp8_dequant_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rdx, arg(1) ;dq
+
+
+ movq mm0, [rax ] ; dequantize: mmN = input row N * dq row N
+ pmullw mm0, [rdx]
+
+ movq mm1, [rax +8]
+ pmullw mm1, [rdx +8]
+
+ movq mm2, [rax+16]
+ pmullw mm2, [rdx+16]
+
+ movq mm3, [rax+24]
+ pmullw mm3, [rdx+24]
+
+ mov rdx, arg(2) ;dest
+
+ pxor mm7, mm7 ; mm7 = 0
+
+
+ movq [rax], mm7 ; zero all 16 input coefficients (32 bytes)
+ movq [rax+8], mm7
+
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+
+
+ movsxd rdi, dword ptr arg(3) ;stride
+
+ psubw mm0, mm2 ; b1= 0-2 (first pass starts here)
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0 = a1 + d1
+
+ paddw mm4, mm7 ;1 = b1 + c1
+ psubw mm0, mm7 ;2 = b1 - c1
+
+ psubw mm6, mm3 ;3 = a1 - d1
+
+ movq mm1, mm2 ; 03 02 01 00 (transpose the 4x4 word matrix for the second pass)
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2 (second pass starts here)
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)] ; b1 += 4: rounding for the >> 3 below
+
+ paddw mm2, [GLOBAL(fours)] ; a1 += 4: rounding for the >> 3 below
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00 (transpose back so each register holds one output row)
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7 ; mm7 = 0 for the byte<->word unpack/pack below
+
+ movd mm4, [rdx] ; row 0: load 4 dest pixels
+ punpcklbw mm4, mm7 ; widen bytes to words
+ paddsw mm0, mm4 ; add residual (signed saturating)
+ packuswb mm0, mm7 ; clamp to 0..255 and narrow to bytes
+ movd [rdx], mm0
+
+ movd mm4, [rdx+rdi] ; row 1
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rdx+2*rdi] ; row 2
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi ; advance one row so [rdx+2*rdi] addresses row 3
+
+ movd mm4, [rdx+2*rdi] ; row 3
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C ; 35468: sqrt(2)*sin(pi/8) in 16 bit fixed point; negative as a signed word, so callers add x back after pmulhw
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B ; 20091: sqrt(2)*cos(pi/8) - 1 in 16 bit fixed point; callers add x back after pmulhw
+align 16
+fours:
+ times 4 dw 0x0004 ; rounding term added before the final arithmetic shift right by 3
diff --git a/libvpx/vp8/common/x86/filter_x86.c b/libvpx/vp8/common/x86/filter_x86.c
new file mode 100644
index 0000000..ebab814
--- /dev/null
+++ b/libvpx/vp8/common/x86/filter_x86.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = /* row i: (128 - 16*i) repeated 4x, then (16*i) repeated 4x */
+{
+ { 128, 128, 128, 128, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 112, 112, 112, 112 } /* the two distinct values in every row sum to 128 */
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = /* row i: (128 - 16*i) repeated 8x, then (16*i) repeated 8x */
+{
+ { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+ { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+ { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+ { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } /* same values as the _4 table, each repeated twice as often */
+};
diff --git a/libvpx/vp8/common/x86/filter_x86.h b/libvpx/vp8/common/x86/filter_x86.h
new file mode 100644
index 0000000..efcc4dc
--- /dev/null
+++ b/libvpx/vp8/common/x86/filter_x86.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_X86_H
+#define FILTER_X86_H
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+extern const short vp8_bilinear_filters_x86_4[8][8]; /* duplicated 4x */
+extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+#endif /* FILTER_X86_H */
diff --git a/libvpx/vp8/common/x86/idct_blk_mmx.c b/libvpx/vp8/common/x86/idct_blk_mmx.c
new file mode 100644
index 0000000..4adf3f5
--- /dev/null
+++ b/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+
+extern void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
+
+void vp8_dequantize_b_mmx(BLOCKD *d, short *DQC) /* C entry point: forwards one block's buffers to the MMX routine */
+{
+ short *sq = (short *) d->qcoeff; /* source coefficients */
+ short *dq = (short *) d->dqcoeff; /* destination for the products */
+
+ vp8_dequantize_b_impl_mmx(sq, dq, DQC); /* DQC supplies the per-coefficient multipliers */
+}
+
+void vp8_dequant_idct_add_y_block_mmx /* walks a 4x4 grid of 4x4 sub-blocks: 16 coeffs each in q, one eobs entry each */
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++) /* one iteration per row of four sub-blocks */
+ {
+ if (eobs[0] > 1) /* more than one coefficient: full dequant + IDCT + add */
+ vp8_dequant_idct_add_mmx (q, dq, dst, stride);
+ else if (eobs[0] == 1) /* exactly one: DC-only path; any other value leaves dst untouched */
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride); /* dst is both prediction and output */
+ ((int *)q)[0] = 0; /* clear the DC with one int-sized store (NOTE(review): also covers q[1] when int is 32 bits) */
+ }
+
+ if (eobs[1] > 1) /* second sub-block: next 16 coeffs, 4 pixels to the right */
+ vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
+ dst+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ if (eobs[2] > 1) /* third sub-block */
+ vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
+ dst+8, stride);
+ ((int *)(q+32))[0] = 0;
+ }
+
+ if (eobs[3] > 1) /* fourth sub-block */
+ vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
+ dst+12, stride);
+ ((int *)(q+48))[0] = 0;
+ }
+
+ q += 64; /* 4 sub-blocks * 16 coefficients */
+ dst += 4*stride; /* down four pixel rows */
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_mmx /* two 2x2 grids of 4x4 sub-blocks: dstu first, then dstv; q and eobs run on contiguously */
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) /* dstu: two rows of two sub-blocks */
+ {
+ if (eobs[0] > 1) /* more than one coefficient: full dequant + IDCT + add */
+ vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
+ else if (eobs[0] == 1) /* exactly one: DC-only path; any other value leaves the pixels untouched */
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
+ ((int *)q)[0] = 0; /* clear the DC with one int-sized store */
+ }
+
+ if (eobs[1] > 1) /* right-hand sub-block: next 16 coeffs, 4 pixels to the right */
+ vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32; /* 2 sub-blocks * 16 coefficients */
+ dstu += 4*stride;
+ eobs += 2;
+ }
+
+ for (i = 0; i < 2; i++) /* dstv: same walk, continuing from where q and eobs stopped */
+ {
+ if (eobs[0] > 1)
+ vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
+ ((int *)q)[0] = 0;
+ }
+
+ if (eobs[1] > 1)
+ vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
+ {
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
+ ((int *)(q+16))[0] = 0;
+ }
+
+ q += 32;
+ dstv += 4*stride;
+ eobs += 2;
+ }
+}
diff --git a/libvpx/vp8/common/x86/idct_blk_sse2.c b/libvpx/vp8/common/x86/idct_blk_sse2.c
new file mode 100644
index 0000000..056e052
--- /dev/null
+++ b/libvpx/vp8/common/x86/idct_blk_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+void vp8_idct_dequant_0_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+void vp8_idct_dequant_full_2x_sse2
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
+
+void vp8_dequant_idct_add_y_block_sse2 /* like the per-sub-block walk, but handles two horizontally adjacent 4x4 sub-blocks per call */
+ (short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+ int i;
+
+ for (i = 0; i < 4; i++) /* one iteration per row of four sub-blocks (two pairs) */
+ {
+ if (((short *)(eobs))[0]) /* reads eobs[0] and eobs[1] as one 16-bit value: skip the pair if both are 0 */
+ {
+ if (((short *)(eobs))[0] & 0xfefe) /* a bit other than bit 0 is set in either byte, i.e. at least one eob is not 0 or 1 */
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+ else /* both eobs are 0 or 1 */
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+ }
+ if (((short *)(eobs))[1]) /* second pair: eobs[2] and eobs[3] */
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride); /* 2 sub-blocks = 32 coeffs, 8 pixels to the right */
+ else
+ vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+ }
+ q += 64; /* 4 sub-blocks * 16 coefficients */
+ dst += stride*4; /* down four pixel rows */
+ eobs += 4;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_sse2 /* four unrolled pair-steps: dstu top, dstu bottom, dstv top, dstv bottom */
+ (short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+ if (((short *)(eobs))[0]) /* eobs[0..1] read as one 16-bit value: skip the pair if both are 0 */
+ {
+ if (((short *)(eobs))[0] & 0xfefe) /* at least one eob of the pair is not 0 or 1 */
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else /* both eobs are 0 or 1 */
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32; /* 2 sub-blocks * 16 coefficients */
+ dstu += stride*4; /* down to the second row of sub-blocks */
+
+ if (((short *)(eobs))[1]) /* eobs[2..3] */
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
+ q += 32;
+
+ if (((short *)(eobs))[2]) /* eobs[4..5]: first pair written to dstv */
+ {
+ if (((short *)(eobs))[2] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+ q += 32;
+ dstv += stride*4;
+
+ if (((short *)(eobs))[3]) /* eobs[6..7]: second pair written to dstv */
+ {
+ if (((short *)(eobs))[3] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
+}
diff --git a/libvpx/vp8/common/x86/idctllm_mmx.asm b/libvpx/vp8/common/x86/idctllm_mmx.asm
new file mode 100644
index 0000000..96fa2c6
--- /dev/null
+++ b/libvpx/vp8/common/x86/idctllm_mmx.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; /****************************************************************************
+; * Notes:
+; *
+; * This implementation makes use of 16 bit fixed point version of two multiply
+; * constants:
+; * 1. sqrt(2) * cos (pi/8)
+; * 2. sqrt(2) * sin (pi/8)
+; * Because the first constant is bigger than 1, to maintain the same 16 bit
+; * fixed point precision as the second one, we use a trick of
+; * x * a = x + x*(a-1)
+; * so
+; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+; *
+; * For the second constant, the 16 bit fixed point version is 35468, which
+; * is bigger than 32767, so in a signed 16 bit multiply it is treated as a
+; * negative number (35468 - 65536). Adding x back compensates:
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x
+; *
+; **************************************************************************/
+
+
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;int pitch, unsigned char *dest,int stride) -- dest rows = clamp(pred rows + IDCT(input)); pitch/stride step pred/dest
+global sym(vp8_short_idct4x4llm_mmx) PRIVATE
+sym(vp8_short_idct4x4llm_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ;input
+ mov rsi, arg(1) ;pred
+
+ movq mm0, [rax ] ; mm0..mm3 = the four rows of 4 input words
+ movq mm1, [rax+ 8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+
+%if 0
+ pxor mm7, mm7 ; (disabled by %if 0: would zero the input buffer)
+ movq [rax], mm7
+ movq [rax+8], mm7
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+%endif
+ movsxd rax, dword ptr arg(2) ;pitch
+ mov rdx, arg(3) ;dest
+ movsxd rdi, dword ptr arg(4) ;stride
+
+
+ psubw mm0, mm2 ; b1= 0-2 (first pass starts here)
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0 = a1 + d1
+
+ paddw mm4, mm7 ;1 = b1 + c1
+ psubw mm0, mm7 ;2 = b1 - c1
+
+ psubw mm6, mm3 ;3 = a1 - d1
+
+ movq mm1, mm2 ; 03 02 01 00 (transpose the 4x4 word matrix for the second pass)
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ movq mm3, mm5 ; 33 23 13 03
+
+ psubw mm0, mm2 ; b1= 0-2 (second pass starts here)
+ paddw mm2, mm2 ;
+
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
+
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
+
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
+
+ movq mm5, mm1
+ movq mm4, mm3
+
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
+
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
+
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)] ; b1 += 4: rounding for the >> 3 below
+
+ paddw mm2, [GLOBAL(fours)] ; a1 += 4: rounding for the >> 3 below
+ movq mm6, mm2 ; a1
+
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
+
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
+
+ psubw mm6, mm3 ;3
+ psraw mm2, 3 ; final scaling: arithmetic shift right by 3 on all four results
+
+ psraw mm0, 3
+ psraw mm4, 3
+
+ psraw mm6, 3
+
+ movq mm1, mm2 ; 03 02 01 00 (transpose back so each register holds one output row)
+ movq mm3, mm4 ; 23 22 21 20
+
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
+
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
+
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
+
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
+
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
+
+ pxor mm7, mm7 ; mm7 = 0 for the byte<->word unpack/pack below
+
+ movd mm4, [rsi] ; row 0: load 4 pred pixels
+ punpcklbw mm4, mm7 ; widen bytes to words
+ paddsw mm0, mm4 ; add residual (signed saturating)
+ packuswb mm0, mm7 ; clamp to 0..255 and narrow to bytes
+ movd [rdx], mm0 ; store 4 pixels to dest
+
+ movd mm4, [rsi+rax] ; row 1
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax] ; row 2
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi ; advance dest one row so [rdx+rdi*2] addresses row 3
+ add rsi, rax ; advance pred one row so [rsi+2*rax] addresses row 3
+
+ movd mm4, [rsi+2*rax] ; row 3
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_dc_only_idct_add_mmx( -- adds (input_dc + 4) >> 3 to a 4x4 block of pred pixels, clamps, stores to dst
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
+global sym(vp8_dc_only_idct_add_mmx) PRIVATE
+sym(vp8_dc_only_idct_add_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ movd mm5, arg(0) ;input_dc
+ mov rax, arg(1) ;pred_ptr
+ movsxd rdx, dword ptr arg(2) ;pred_stride
+
+ pxor mm0, mm0 ; mm0 = 0 for the byte<->word unpack/pack below
+
+ paddw mm5, [GLOBAL(fours)] ; dc + 4 (rounding)
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * pred_stride
+
+ psraw mm5, 3 ; (dc + 4) >> 3
+
+ punpcklwd mm5, mm5 ; broadcast the low word ...
+
+ punpckldq mm5, mm5 ; ... to all four words of mm5
+
+ movd mm1, [rax] ; load 4 pred pixels from each of the 4 rows
+ movd mm2, [rax+rdx]
+ movd mm3, [rax+2*rdx]
+ movd mm4, [rax+rcx]
+
+ mov rax, arg(3) ;d -- destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
+ punpcklbw mm1, mm0 ; widen bytes to words
+ paddsw mm1, mm5 ; add the DC term (signed saturating)
+ packuswb mm1, mm0 ; pack and unpack to saturate
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+
+ punpcklbw mm2, mm0
+ paddsw mm2, mm5
+ packuswb mm2, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm3, mm0
+ paddsw mm3, mm5
+ packuswb mm3, mm0 ; pack and unpack to saturate
+
+ punpcklbw mm4, mm0
+ paddsw mm4, mm5
+ packuswb mm4, mm0 ; pack and unpack to saturate
+
+ movd [rax], mm1 ; store the 4 result rows
+ movd [rax+rdx], mm2
+ movd [rax+2*rdx], mm3
+ movd [rax+rcx], mm4
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+x_s1sqr2:
+ times 4 dw 0x8A8C ; 35468: sqrt(2)*sin(pi/8) in 16 bit fixed point; negative as a signed word, so callers add x back after pmulhw
+align 16
+x_c1sqr2less1:
+ times 4 dw 0x4E7B ; 20091: sqrt(2)*cos(pi/8) - 1 in 16 bit fixed point; callers add x back after pmulhw
+align 16
+fours:
+ times 4 dw 0x0004 ; rounding term added before the final arithmetic shift right by 3
diff --git a/libvpx/vp8/common/x86/idctllm_sse2.asm b/libvpx/vp8/common/x86/idctllm_sse2.asm
new file mode 100644
index 0000000..bf8e2c4
--- /dev/null
+++ b/libvpx/vp8/common/x86/idctllm_sse2.asm
@@ -0,0 +1,708 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_idct_dequant_0_2x_sse2 -- DC-only path for two side-by-side 4x4 blocks (8 pixels wide, 4 rows)
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+
+global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ GET_GOT rbx
+ ; end prolog
+
+ mov rdx, arg(1) ; dequant
+ mov rax, arg(0) ; qcoeff
+
+ movd xmm4, [rax] ; word 0 = qcoeff[0] (DC of block 0)
+ movd xmm5, [rdx] ; word 0 = dequant[0]
+
+ pinsrw xmm4, [rax+32], 4 ; word 4 = qcoeff[16] (DC of block 1)
+ pinsrw xmm5, [rdx], 4 ; word 4 = dequant[0] again
+
+ pmullw xmm4, xmm5 ; dequantize both DC values
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; clear coeffs
+ movd [rax], xmm5 ; zeroes 4 bytes at the start of block 0
+ movd [rax+32], xmm5 ; zeroes 4 bytes at the start of block 1
+;pshufb
+ mov rax, arg(2) ; dst
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ pshuflw xmm4, xmm4, 00000000b ; broadcast block 0 DC over the low 4 words
+ pshufhw xmm4, xmm4, 00000000b ; broadcast block 1 DC over the high 4 words
+
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+ paddw xmm4, [GLOBAL(fours)] ; + 4 (rounding)
+
+ psraw xmm4, 3 ; (dc + 4) >> 3
+
+ movq xmm0, [rax] ; load 8 dst pixels from each of the 4 rows
+ movq xmm1, [rax+rdx]
+ movq xmm2, [rax+2*rdx]
+ movq xmm3, [rax+rcx]
+
+ punpcklbw xmm0, xmm5 ; widen bytes to words
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5 ; clamp to 0..255 and narrow to bytes
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rax], xmm0
+ movq [rax + rdx], xmm1
+
+ lea rax, [rax + 2*rdx] ; move down two rows
+
+ movq [rax], xmm2
+ movq [rax + rdx], xmm3
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_full_2x_sse2 -- dequantize + two-pass IDCT + add for two side-by-side 4x4 blocks
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
+global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; full case: all 32 coefficients of both blocks are loaded, dequantized
+ ; and transformed (rsi is saved/restored but not otherwise used here)
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+ mov rdi, arg(2) ; dst
+
+
+ ; Zero out xmm7, for use unpacking
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+ ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer
+ pmullw xmm0, [rdx] ; block 0, coefficients 0..7
+ pmullw xmm2, [rdx+16] ; block 0, coefficients 8..15
+ pmullw xmm1, [rdx] ; block 1, coefficients 0..7
+ pmullw xmm3, [rdx+16] ; block 1, coefficients 8..15
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ lea rcx, [rdx + rdx*2] ;dst_stride * 3
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)] ; b1 += 4: rounding for the >> 3 below
+
+ paddw xmm2, [GLOBAL(fours)] ; a1 += 4: rounding for the >> 3 below
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3 ; final scaling: arithmetic shift right by 3 on all four results
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7 ; xmm7 = 0 again for the byte<->word unpack/pack below
+
+ ; Load up predict blocks
+ movq xmm4, [rdi] ; rows 0 and 1: 8 dst pixels each
+ movq xmm5, [rdi+rdx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+2*rdx] ; rows 2 and 3
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish: ; (label is not referenced anywhere in this function)
+
+ ; pack up before storing
+ packuswb xmm0, xmm7 ; clamp to 0..255 and narrow to bytes
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_idct_dequant_dc_0_2x_sse2 -- DC-only path for two side-by-side 4x4 blocks; the DC values come from *dc
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
+global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_0_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+
+ ; special case when 2 blocks have 0 or 1 coeffs
+ ; dc is set as first coeff, so no need to load qcoeff
+ mov rax, arg(0) ; qcoeff (loaded, but rax is never read afterwards in this routine)
+
+ mov rdi, arg(2) ; dst
+ mov rdx, arg(4) ; dc
+
+ ; Zero out xmm5, for use unpacking
+ pxor xmm5, xmm5
+
+ ; load up 2 dc words here == 2*16 = doubleword
+ movd xmm4, [rdx]
+
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ lea rcx, [rdx + rdx*2] ; rcx = 3 * dst_stride
+ ; Load up predict blocks
+ movq xmm0, [rdi]
+ movq xmm1, [rdi+rdx*1]
+ movq xmm2, [rdi+rdx*2]
+ movq xmm3, [rdi+rcx]
+
+ ; Duplicate and expand dc across
+ punpcklwd xmm4, xmm4 ; dc0 dc0 dc1 dc1 in the low 4 words
+ punpckldq xmm4, xmm4 ; dc0 x4 in the low 4 words, dc1 x4 in the high 4 words
+
+ ; Round and downshift: (dc + 4) >> 3
+ paddw xmm4, [GLOBAL(fours)]
+ psraw xmm4, 3
+
+ ; Predict buffer needs to be expanded from bytes to words
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+
+ ; Add to predict buffer
+ paddw xmm0, xmm4
+ paddw xmm1, xmm4
+ paddw xmm2, xmm4
+ paddw xmm3, xmm4
+
+ ; pack up before storing
+ packuswb xmm0, xmm5 ; clamp to 0..255 and narrow to bytes
+ packuswb xmm1, xmm5
+ packuswb xmm2, xmm5
+ packuswb xmm3, xmm5
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+; short *qcoeff - 0 (coefficients of two 4x4 blocks, 64 bytes; zeroed by this routine)
+; short *dequant - 1 (16 dequantization factors, applied to both blocks)
+; unsigned char *dst - 2 (prediction is read from here and the result is written back)
+; int dst_stride - 3
+; short *dc - 4 (dc[0] / dc[1] replace coefficient 0 of block 0 / block 1)
+; )
+global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
+sym(vp8_idct_dequant_dc_full_2x_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rdi
+ ; end prolog
+ ; Full inverse DCT of two 4x4 blocks at once: both blocks' coefficients
+ ; are loaded and dequantized, the DC terms are inserted from dc[], and
+ ; the two-pass transform result is added to four 8-pixel rows at dst.
+ mov rax, arg(0) ; qcoeff
+ mov rdx, arg(1) ; dequant
+
+ mov rdi, arg(2) ; dst
+
+ ; Zero out xmm7, used below to clear the coefficient buffer
+ pxor xmm7, xmm7
+
+
+ ; note the transpose of xmm1 and xmm2, necessary for shuffle
+ ; to spit out sensible data
+ movdqa xmm0, [rax]
+ movdqa xmm2, [rax+16]
+ movdqa xmm1, [rax+32]
+ movdqa xmm3, [rax+48]
+
+ ; Clear out coeffs
+ movdqa [rax], xmm7
+ movdqa [rax+16], xmm7
+ movdqa [rax+32], xmm7
+ movdqa [rax+48], xmm7
+
+ ; dequantize qcoeff buffer (same 16 factors for both blocks)
+ pmullw xmm0, [rdx]
+ pmullw xmm2, [rdx+16]
+ pmullw xmm1, [rdx]
+ pmullw xmm3, [rdx+16]
+
+ ; DC component
+ mov rdx, arg(4) ; dc
+
+ ; repack so block 0 row x and block 1 row x are together
+ movdqa xmm4, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm4, xmm1
+
+ pshufd xmm0, xmm0, 11011000b
+ pshufd xmm1, xmm4, 11011000b
+
+ movdqa xmm4, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm4, xmm3
+
+ pshufd xmm2, xmm2, 11011000b
+ pshufd xmm3, xmm4, 11011000b
+
+ ; insert DC component
+ pinsrw xmm0, [rdx], 0 ; word 0 = first coefficient of block 0
+ pinsrw xmm0, [rdx+2], 4 ; word 4 = first coefficient of block 1
+
+ ; first pass
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2 ;
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+
+ ; transpose for the second pass
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ ; second pass (same butterfly as the first, plus rounding and >> 3)
+ psubw xmm0, xmm2 ; b1 = 0-2
+ paddw xmm2, xmm2
+
+ movdqa xmm5, xmm1
+ paddw xmm2, xmm0 ; a1 = 0+2
+
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
+
+ movdqa xmm7, xmm3
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)]
+
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw xmm7, xmm5 ; c1
+
+ movdqa xmm5, xmm1
+ movdqa xmm4, xmm3
+
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)]
+ paddw xmm5, xmm1
+
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)]
+ paddw xmm3, xmm4
+
+ paddw xmm3, xmm5 ; d1
+ paddw xmm0, [GLOBAL(fours)] ; b1 + 4: rounding for the >> 3 below
+
+ paddw xmm2, [GLOBAL(fours)] ; a1 + 4: rounding for the >> 3 below
+ movdqa xmm6, xmm2 ; a1
+
+ movdqa xmm4, xmm0 ; b1
+ paddw xmm2, xmm3 ;0
+
+ paddw xmm4, xmm7 ;1
+ psubw xmm0, xmm7 ;2
+
+ psubw xmm6, xmm3 ;3
+ psraw xmm2, 3
+
+ psraw xmm0, 3
+ psraw xmm4, 3
+
+ psraw xmm6, 3
+
+ ; transpose to save
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100
+
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108
+
+
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002
+
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102
+
+
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001
+
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003
+
+ pshufd xmm0, xmm2, 11011000b
+ pshufd xmm2, xmm1, 11011000b
+
+ pshufd xmm1, xmm5, 11011000b
+ pshufd xmm3, xmm7, 11011000b
+
+ pxor xmm7, xmm7 ; zero again: used for byte->word unpacking and for packing
+
+ ; Load up predict blocks
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+ lea rcx, [rdx + rdx*2] ; 3 * dst_stride
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm0, xmm4
+ paddw xmm1, xmm5
+
+ movq xmm4, [rdi+rdx*2]
+ movq xmm5, [rdi+rcx]
+
+ punpcklbw xmm4, xmm7
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+
+.finish:
+
+ ; pack up before storing (unsigned saturation to bytes)
+ packuswb xmm0, xmm7
+ packuswb xmm1, xmm7
+ packuswb xmm2, xmm7
+ packuswb xmm3, xmm7
+
+ ; Load destination stride before writing out,
+ ; doesn't need to persist
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
+ ; store blocks back out
+ movq [rdi], xmm0
+ movq [rdi + rdx], xmm1
+
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm2
+ movq [rdi + rdx], xmm3
+
+
+ ; begin epilog
+ pop rdi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+fours:
+ times 8 dw 0x0004 ; rounding term added before the final arithmetic shift by 3
+align 16
+x_s1sqr2:
+ times 8 dw 0x8A8C ; 35468 ~= sin(pi/8) * sqrt(2) * 65536 (pmulhw result is corrected by adding the input)
+align 16
+x_c1sqr2less1:
+ times 8 dw 0x4E7B ; 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536
diff --git a/libvpx/vp8/common/x86/iwalsh_mmx.asm b/libvpx/vp8/common/x86/iwalsh_mmx.asm
new file mode 100644
index 0000000..4aac094
--- /dev/null
+++ b/libvpx/vp8/common/x86/iwalsh_mmx.asm
@@ -0,0 +1,140 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
+sym(vp8_short_inv_walsh4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+ ; Two-pass 4x4 butterfly on 16 input words; each result is (x + 3) >> 3 and is stored 32 bytes apart in output.
+ mov rdx, arg(0) ; input
+ mov rax, 30003h ; two words of 3: rounding term, expanded into mm7 below
+
+ movq mm0, [rdx + 0] ;ip[0]
+ movq mm1, [rdx + 8] ;ip[4]
+ movd mm7, rax
+
+ movq mm2, [rdx + 16] ;ip[8]
+ movq mm3, [rdx + 24] ;ip[12]
+ punpcklwd mm7, mm7 ;0003000300030003h
+ mov rdx, arg(1) ; output
+
+ movq mm4, mm0
+ movq mm5, mm1
+
+ paddw mm4, mm3 ;ip[0] + ip[12] aka a1
+ paddw mm5, mm2 ;ip[4] + ip[8] aka b1
+
+ movq mm6, mm4 ;temp a1
+ paddw mm4, mm5 ;a1 + b1
+ psubw mm6, mm5 ;a1 - b1
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm1, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp d1
+ paddw mm0, mm1 ;d1 + c1
+ psubw mm5, mm1 ;d1 - c1
+
+ ; 03 02 01 00
+ ; 13 12 11 10
+ ; 23 22 21 20
+ ; 33 32 31 30
+
+ movq mm3, mm4 ; 03 02 01 00
+ punpcklwd mm4, mm0 ; 11 01 10 00
+ punpckhwd mm3, mm0 ; 13 03 12 02
+
+ movq mm1, mm6 ; 23 22 21 20
+ punpcklwd mm6, mm5 ; 31 21 30 20
+ punpckhwd mm1, mm5 ; 33 23 32 22
+
+ movq mm0, mm4 ; 11 01 10 00
+ movq mm2, mm3 ; 13 03 12 02
+
+ punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
+ punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
+
+ punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
+ punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
+;~~~~~~~~~~~~~~~~~~~~~ second pass on the transposed data, with rounding
+ movq mm1, mm0
+ movq mm5, mm4
+ paddw mm1, mm3 ;ip[0] + ip[12] aka a1
+ paddw mm5, mm2 ;ip[4] + ip[8] aka b1
+
+ movq mm6, mm1 ;temp a1
+ paddw mm1, mm5 ;a1 + b1
+ psubw mm6, mm5 ;a1 - b1
+ paddw mm1, mm7 ; + 3
+ paddw mm6, mm7 ; + 3
+ psraw mm1, 3
+ psraw mm6, 3
+
+ psubw mm0, mm3 ;ip[0] - ip[12] aka d1
+ psubw mm4, mm2 ;ip[4] - ip[8] aka c1
+
+ movq mm5, mm0 ;temp d1
+ paddw mm0, mm4 ;d1 + c1
+ psubw mm5, mm4 ;d1 - c1
+ paddw mm0, mm7 ; + 3
+ paddw mm5, mm7 ; + 3
+ psraw mm0, 3
+ psraw mm5, 3
+;~~~~~~~~~~~~~~~~~~~~~
+ ; scatter: word k of mm1 / mm0 goes to output slot 4k / 4k+1 (slots are 32 bytes apart)
+ movd eax, mm1
+ movd ecx, mm0
+ psrlq mm0, 32
+ psrlq mm1, 32
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*1], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*5], cx
+ movd eax, mm1
+ movd ecx, mm0
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*9], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*13], cx
+ ; word k of mm6 / mm5 goes to output slot 4k+2 / 4k+3
+ movd eax, mm6
+ movd ecx, mm5
+ psrlq mm5, 32
+ psrlq mm6, 32
+ mov word ptr[rdx+32*2], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*6], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, mm6
+ movd ecx, mm5
+ mov word ptr[rdx+32*10], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*14], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
diff --git a/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libvpx/vp8/common/x86/iwalsh_sse2.asm
new file mode 100644
index 0000000..06e86a8
--- /dev/null
+++ b/libvpx/vp8/common/x86/iwalsh_sse2.asm
@@ -0,0 +1,121 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
+global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE
+sym(vp8_short_inv_walsh4x4_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 2
+ ; end prolog
+ ; Two-pass 4x4 butterfly on 16 input words; each result is (x + 3) >> 3 and is stored 32 bytes apart in output.
+ mov rcx, arg(0) ; input (read with movdqa, so it must be 16-byte aligned)
+ mov rdx, arg(1) ; output
+ mov rax, 30003h ; two words of 3: rounding term, broadcast into xmm0 before the second pass
+
+ movdqa xmm0, [rcx + 0] ;ip[4] ip[0]
+ movdqa xmm1, [rcx + 16] ;ip[12] ip[8]
+
+
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm0 ;ip[4] ip[0]
+
+ paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm4, xmm0
+ punpcklqdq xmm0, xmm3 ;d1 a1
+ punpckhqdq xmm4, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm4 ;c1 b1
+ paddw xmm4, xmm0 ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ ; 13 12 11 10 03 02 01 00
+ ;
+ ; 33 32 31 30 23 22 21 20
+ ;
+ movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
+ punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
+ punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
+ movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
+ punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
+ ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ movd xmm0, eax ; 00030003h, still in eax from the prolog
+ pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
+ movdqa xmm3, xmm4 ;ip[4] ip[0]
+
+ pshufd xmm0, xmm0, 0 ;03 03 03 03 03 03 03 03
+
+ paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+ psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm3 ;d1 a1
+ punpckhqdq xmm5, xmm3 ;c1 b1
+
+ movdqa xmm1, xmm5 ;c1 b1
+ paddw xmm5, xmm4 ;d1+c1 a1+b1 aka op[4] op[0]
+ psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
+
+ paddw xmm5, xmm0 ; + 3
+ paddw xmm4, xmm0 ; + 3
+ psraw xmm5, 3
+ psraw xmm4, 3
+ ; scatter the low four words of xmm5 / xmm4 to output slots 0,4,8,12 / 2,6,10,14
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*0], ax
+ mov word ptr[rdx+32*2], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*4], ax
+ mov word ptr[rdx+32*6], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*8], ax
+ mov word ptr[rdx+32*10], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*12], ax
+ mov word ptr[rdx+32*14], cx
+ ; the high four words of xmm5 / xmm4 go to output slots 1,5,9,13 / 3,7,11,15
+ movd eax, xmm5
+ movd ecx, xmm4
+ psrldq xmm5, 4
+ psrldq xmm4, 4
+ mov word ptr[rdx+32*1], ax
+ mov word ptr[rdx+32*3], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*5], ax
+ mov word ptr[rdx+32*7], cx
+ movd eax, xmm5
+ movd ecx, xmm4
+ mov word ptr[rdx+32*9], ax
+ mov word ptr[rdx+32*11], cx
+ shr eax, 16
+ shr ecx, 16
+ mov word ptr[rdx+32*13], ax
+ mov word ptr[rdx+32*15], cx
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/loopfilter_block_sse2.asm b/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
new file mode 100644
index 0000000..1c445ef
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
@@ -0,0 +1,813 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+ ; Per-byte unsigned absolute difference: %1 = abs(%1 - %2).
+ ; %1 value not preserved
+ ; %2 value preserved
+ ; output in %1; clobbers scratch1, which the expanding function must %define
+ movdqa scratch1, %2 ; v2
+ ; the two saturating differences are OR-ed: one of them is always zero
+ psubusb scratch1, %1 ; v2 - v1
+ psubusb %1, %2 ; v1 - v2
+ por %1, scratch1 ; abs(v2 - v1)
+%endmacro
+
+%macro LF_FILTER_HEV_MASK 8-9
+ ; %1-%8: p3 p2 p1 p0 q0 q1 q2 q3 (all overwritten); optional %9: precomputed abs(p1 - p0)
+ LF_ABS %1, %2 ; abs(p3 - p2)
+ LF_ABS %2, %3 ; abs(p2 - p1)
+ pmaxub %1, %2 ; accumulate mask
+%if %0 == 8
+ movdqa scratch2, %3 ; save p1
+ LF_ABS scratch2, %4 ; abs(p1 - p0)
+%endif
+ LF_ABS %4, %5 ; abs(p0 - q0)
+ LF_ABS %5, %6 ; abs(q0 - q1)
+%if %0 == 8
+ pmaxub %5, scratch2 ; accumulate hev
+%else
+ pmaxub %5, %9 ; accumulate hev using the caller-supplied abs(p1 - p0)
+%endif
+ pmaxub %1, %5 ; accumulate mask
+ ; %7 is left holding abs(q2 - q3); callers pass it as %9 for the next edge
+ LF_ABS %3, %6 ; abs(p1 - q1)
+ LF_ABS %6, %7 ; abs(q1 - q2)
+ pmaxub %1, %6 ; accumulate mask
+ LF_ABS %7, %8 ; abs(q2 - q3)
+ pmaxub %1, %7 ; accumulate mask
+ ; tfe clears bit 0 of every byte so the 16-bit shift cannot leak a bit into the byte below
+ paddusb %4, %4 ; 2 * abs(p0 - q0)
+ pand %3, [GLOBAL(tfe)]
+ psrlw %3, 1 ; abs(p1 - q1) / 2
+ paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+ ; limit, blimit, thresh and zero are names %define'd by the expanding function
+ psubusb %1, [limit]
+ psubusb %4, [blimit]
+ por %1, %4
+ pcmpeqb %1, zero ; mask
+ ; outputs: %1 = mask (0xff where no limit was exceeded), %5 = ~hev (0xff where the max is <= thresh)
+ psubusb %5, [thresh]
+ pcmpeqb %5, zero ; ~hev
+%endmacro
+
+%macro LF_FILTER 6
+ ; %1-%4: p1-q1 (p1 p0 q0 q1), replaced in place by the filtered pixels
+ ; %5: mask (overwritten)
+ ; %6: ~hev as produced by LF_FILTER_HEV_MASK (preserved); uses scratch1, scratch2, zero
+ ; t80 flips the sign bit: unsigned pixels <-> signed values for saturating arithmetic
+ movdqa scratch2, %6 ; save ~hev
+
+ pxor %1, [GLOBAL(t80)] ; ps1
+ pxor %4, [GLOBAL(t80)] ; qs1
+ movdqa scratch1, %1
+ psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
+ pandn scratch2, scratch1 ; vp8_filter &= hev (pandn inverts ~hev)
+
+ pxor %2, [GLOBAL(t80)] ; ps0
+ pxor %3, [GLOBAL(t80)] ; qs0
+ movdqa scratch1, %3
+ psubsb scratch1, %2 ; qs0 - ps0
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
+ pand %5, scratch2 ; &= mask
+
+ movdqa scratch2, %5
+ paddsb %5, [GLOBAL(t4)] ; Filter1
+ paddsb scratch2, [GLOBAL(t3)] ; Filter2
+
+ ; Filter1 >> 3 (per-byte arithmetic shift built from psrlw plus sign-fill bits)
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5 ; 0xff in bytes where Filter1 is negative
+ psrlw %5, 3
+ pand scratch1, [GLOBAL(te0)] ; top three bits for negative bytes
+ pand %5, [GLOBAL(t1f)] ; drop bits shifted in from the neighbouring byte
+ por %5, scratch1
+
+ psubsb %3, %5 ; qs0 - Filter1
+ pxor %3, [GLOBAL(t80)]
+
+ ; Filter2 >> 3
+ movdqa scratch1, zero
+ pcmpgtb scratch1, scratch2
+ psrlw scratch2, 3
+ pand scratch1, [GLOBAL(te0)]
+ pand scratch2, [GLOBAL(t1f)]
+ por scratch2, scratch1
+
+ paddsb %2, scratch2 ; ps0 + Filter2
+ pxor %2, [GLOBAL(t80)]
+
+ ; outer tap adjustments: (Filter1 + 1) >> 1, applied only where ~hev is set
+ paddsb %5, [GLOBAL(t1)]
+ movdqa scratch1, zero
+ pcmpgtb scratch1, %5
+ psrlw %5, 1
+ pand scratch1, [GLOBAL(t80)]
+ pand %5, [GLOBAL(t7f)]
+ por %5, scratch1
+ pand %5, %6 ; vp8_filter &= ~hev
+
+ psubsb %4, %5 ; qs1 - vp8_filter
+ pxor %4, [GLOBAL(t80)]
+
+ paddsb %1, %5 ; ps1 + vp8_filter
+ pxor %1, [GLOBAL(t80)]
+%endmacro
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+ ; Filters the horizontal edges between rows 3|4, 7|8 and 11|12 of the 16 rows (16 bytes each) at src_ptr.
+%ifidn __OUTPUT_FORMAT__,x64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 11 ; xmm6-xmm11 are callee-saved on Win64 and are clobbered below
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi ; src_ptr
+ %define stride rsi ; src_pixel_step
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+ ; i<k> addresses row k: [src + k * stride], built from spp = src + stride and stride3/5/7
+ %define i0 [src]
+ %define i1 [spp]
+ %define i2 [src + 2 * stride]
+ %define i3 [spp + 2 * stride]
+ %define i4 [src + 4 * stride]
+ %define i5 [spp + 4 * stride]
+ %define i6 [src + 2 * stride3]
+ %define i7 [spp + 2 * stride3]
+ %define i8 [src + 8 * stride]
+ %define i9 [spp + 8 * stride]
+ %define i10 [src + 2 * stride5]
+ %define i11 [spp + 2 * stride5]
+ %define i12 [src + 4 * stride3]
+ %define i13 [spp + 4 * stride3]
+ %define i14 [src + 2 * stride7]
+ %define i15 [spp + 2 * stride7]
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+ pxor zero, zero
+
+ ; load the first set into registers (rows 0-7 are p3..p0, q0..q3 of the first edge)
+ movdqa xmm0, i0
+ movdqa xmm1, i1
+ movdqa xmm2, i2
+ movdqa xmm3, i3
+ movdqa xmm4, i4
+ movdqa xmm8, i5
+ movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
+ movdqa xmm10, i7
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+ ; the mask macro destroyed its inputs: reload p1 p0 q0 q1 before filtering
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm3, i4
+ movdqa xmm8, i5
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set (filtered rows 4 and 5 stay in xmm3/xmm8 as p3/p2 of the next edge)
+ movdqa i4, xmm3
+ movdqa i5, xmm8
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm4, i8
+ movdqa xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm4
+ movdqa i9, xmm8
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm3, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm3, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm3
+ movdqa i13, xmm8
+
+%ifidn __OUTPUT_FORMAT__,x64
+ pop r13
+ pop r12
+ RESTORE_XMM ; pairs with SAVE_XMM 11 in the prolog
+ pop rbp
+%endif
+ ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Transposes the 16x16 byte area at src_ptr onto the stack, filters it there, and transposes it back.
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%ifidn __OUTPUT_FORMAT__,x64
+ %define src rcx ; src_ptr
+ %define stride rdx ; src_pixel_step
+ %define blimit r8
+ %define limit r9
+ %define thresh r10
+
+ %define spp rax
+ %define stride3 r11
+ %define stride5 r12
+ %define stride7 r13
+
+ push rbp
+ mov rbp, rsp
+ SAVE_XMM 15 ; xmm6-xmm15 are all used below
+ push r12
+ push r13
+ mov thresh, arg(4)
+%else
+ %define src rdi
+ %define stride rsi
+ %define blimit rdx
+ %define limit rcx
+ %define thresh r8
+
+ %define spp rax
+ %define stride3 r9
+ %define stride5 r10
+ %define stride7 r11
+%endif
+
+ %define scratch1 xmm5
+ %define scratch2 xmm6
+ %define zero xmm7
+ ; s<k> addresses source row k: [src + k * stride]
+ %define s0 [src]
+ %define s1 [spp]
+ %define s2 [src + 2 * stride]
+ %define s3 [spp + 2 * stride]
+ %define s4 [src + 4 * stride]
+ %define s5 [spp + 4 * stride]
+ %define s6 [src + 2 * stride3]
+ %define s7 [spp + 2 * stride3]
+ %define s8 [src + 8 * stride]
+ %define s9 [spp + 8 * stride]
+ %define s10 [src + 2 * stride5]
+ %define s11 [spp + 2 * stride5]
+ %define s12 [src + 4 * stride3]
+ %define s13 [spp + 4 * stride3]
+ %define s14 [src + 2 * stride7]
+ %define s15 [spp + 2 * stride7]
+ ; i<k> addresses 16-byte slot k of the stack buffer reserved below
+ %define i0 [rsp]
+ %define i1 [rsp + 16]
+ %define i2 [rsp + 32]
+ %define i3 [rsp + 48]
+ %define i4 [rsp + 64]
+ %define i5 [rsp + 80]
+ %define i6 [rsp + 96]
+ %define i7 [rsp + 112]
+ %define i8 [rsp + 128]
+ %define i9 [rsp + 144]
+ %define i10 [rsp + 160]
+ %define i11 [rsp + 176]
+ %define i12 [rsp + 192]
+ %define i13 [rsp + 208]
+ %define i14 [rsp + 224]
+ %define i15 [rsp + 240]
+
+ ALIGN_STACK 16, rax ; rax is only a temporary here; spp (rax) is set below
+
+ ; reserve stack space
+ %define temp_storage 0 ; size is 256 (16*16)
+ %define stack_size 256
+ sub rsp, stack_size
+
+ ; prep work
+ lea spp, [src + stride]
+ lea stride3, [stride + 2 * stride]
+ lea stride5, [stride3 + 2 * stride]
+ lea stride7, [stride3 + 4 * stride]
+
+ ; 8-f
+ movdqa xmm0, s8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s9 ; 80 90
+ punpckhbw xmm1, s9 ; 88 98
+
+ movdqa xmm2, s10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s11 ; a0 b0
+ punpckhbw xmm3, s11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s13 ; c0 d0
+ punpckhbw xmm5, s13 ; c8 d8
+
+ movdqa xmm6, s14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s15 ; e0 f0
+ punpckhbw xmm7, s15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; spill the half-transposed rows 8-f; xmm0-xmm8 are reused for rows 0-7 below
+ movdqa i0, xmm0
+ movdqa i1, xmm7
+ movdqa i2, xmm4
+ movdqa i3, xmm3
+ movdqa i4, xmm1
+ movdqa i5, xmm8
+ movdqa i6, xmm2
+ movdqa i7, xmm5
+
+ ; 0-7
+ movdqa xmm0, s0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, s1 ; 00 10
+ punpckhbw xmm1, s1 ; 08 18
+
+ movdqa xmm2, s2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, s3 ; 20 30
+ punpckhbw xmm3, s3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, s4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, s5 ; 40 50
+ punpckhbw xmm5, s5 ; 48 58
+
+ movdqa xmm6, s6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, s7 ; 60 70
+ punpckhbw xmm7, s7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+ ; (join each rows 0-7 half with the matching rows 8-f half spilled to the stack)
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i0
+ punpckhqdq xmm6, i0
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i1
+ punpckhqdq xmm9, i1
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i2
+ punpckhqdq xmm10, i2
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i3
+ punpckhqdq xmm11, i3
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i4
+ punpckhqdq xmm12, i4
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i5
+ punpckhqdq xmm13, i5
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i6
+ punpckhqdq xmm14, i6
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i7
+ punpckhqdq xmm15, i7
+
+ movdqa i0, xmm0
+ movdqa i1, xmm6
+ movdqa i2, xmm7
+ movdqa i3, xmm9
+ movdqa i4, xmm4
+ movdqa i5, xmm10
+ movdqa i6, xmm3
+ movdqa i7, xmm11
+ movdqa i8, xmm1
+ movdqa i9, xmm12
+ movdqa i10, xmm8
+ movdqa i11, xmm13
+ movdqa i12, xmm2
+ movdqa i13, xmm14
+ movdqa i14, xmm5
+ movdqa i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+ movdqa xmm12, xmm6 ; copy i1: xmm6 is scratch2 inside the macros
+ movdqa xmm13, xmm7 ; copy i2: xmm7 becomes zero on the next instruction
+
+ pxor zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+ ; the mask macro destroyed its inputs: reload p1 p0 q0 q1 from the stack
+ movdqa xmm1, i2
+ movdqa xmm2, i3
+ movdqa xmm8, i4
+ movdqa xmm9, i5
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+ movdqa i2, xmm1
+ movdqa i3, xmm2
+
+; second set (filtered i4/i5 stay in xmm8/xmm9 as p3/p2; xmm3 holds abs(p1-p0) from the first set)
+ movdqa i4, xmm8
+ movdqa i5, xmm9
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm2, i8
+ movdqa xmm4, i9
+ movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i11
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+ movdqa xmm0, i6
+ movdqa xmm1, i7
+ movdqa xmm3, i8
+ movdqa xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+ movdqa i6, xmm0
+ movdqa i7, xmm1
+
+; last set
+ movdqa i8, xmm3
+ movdqa i9, xmm4
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm2, i12
+ movdqa xmm8, i13
+ movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
+ movdqa xmm11, i15
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+ movdqa xmm0, i10
+ movdqa xmm1, i11
+ movdqa xmm4, i12
+ movdqa xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+ movdqa i10, xmm0
+ movdqa i11, xmm1
+ movdqa i12, xmm4
+ movdqa i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT (same transpose as above, now from the stack buffer back to the source rows)
+ ; 8-f
+ movdqa xmm0, i8
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i9 ; 80 90
+ punpckhbw xmm1, i9 ; 88 98
+
+ movdqa xmm2, i10
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i11 ; a0 b0
+ punpckhbw xmm3, i11 ; a8 b8
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 80 90 a0 b0
+ punpckhwd xmm4, xmm2 ; 84 94 a4 b4
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 88 98 a8 b8
+ punpckhwd xmm2, xmm3 ; 8c 9c ac bc
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i12
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i13 ; c0 d0
+ punpckhbw xmm5, i13 ; c8 d8
+
+ movdqa xmm6, i14
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i15 ; e0 f0
+ punpckhbw xmm7, i15 ; e8 f8
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
+ punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
+ punpckhwd xmm6, xmm7 ; cc dc ec fc
+
+ ; pull the third and fourth sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
+ punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
+ punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
+ punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
+ punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
+
+ ; spill these halves back into slots i8-i15; xmm0-xmm8 are reused for slots 0-7 below
+ movdqa i8, xmm0
+ movdqa i9, xmm7
+ movdqa i10, xmm4
+ movdqa i11, xmm3
+ movdqa i12, xmm1
+ movdqa i13, xmm8
+ movdqa i14, xmm2
+ movdqa i15, xmm5
+
+ ; 0-7
+ movdqa xmm0, i0
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, i1 ; 00 10
+ punpckhbw xmm1, i1 ; 08 18
+
+ movdqa xmm2, i2
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, i3 ; 20 30
+ punpckhbw xmm3, i3 ; 28 38
+
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm2 ; 00 10 20 30
+ punpckhwd xmm4, xmm2 ; 04 14 24 34
+
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm3 ; 08 18 28 38
+ punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
+
+ ; using xmm[0124]
+ ; work on next 4 rows
+
+ movdqa xmm3, i4
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, i5 ; 40 50
+ punpckhbw xmm5, i5 ; 48 58
+
+ movdqa xmm6, i6
+ movdqa xmm7, xmm6
+ punpcklbw xmm6, i7 ; 60 70
+ punpckhbw xmm7, i7 ; 68 78
+
+ movdqa xmm8, xmm3
+ punpcklwd xmm3, xmm6 ; 40 50 60 70
+ punpckhwd xmm8, xmm6 ; 44 54 64 74
+
+ movdqa xmm6, xmm5
+ punpcklwd xmm5, xmm7 ; 48 58 68 78
+ punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
+
+ ; pull the first two sets together
+
+ movdqa xmm7, xmm0
+ punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
+ punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
+
+ movdqa xmm3, xmm4
+ punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
+ punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
+
+ movdqa xmm8, xmm1
+ punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
+ punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+ movdqa xmm5, xmm2
+ punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
+ punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
+ ; final combination
+
+ movdqa xmm6, xmm0
+ punpcklqdq xmm0, i8
+ punpckhqdq xmm6, i8
+
+ movdqa xmm9, xmm7
+ punpcklqdq xmm7, i9
+ punpckhqdq xmm9, i9
+
+ movdqa xmm10, xmm4
+ punpcklqdq xmm4, i10
+ punpckhqdq xmm10, i10
+
+ movdqa xmm11, xmm3
+ punpcklqdq xmm3, i11
+ punpckhqdq xmm11, i11
+
+ movdqa xmm12, xmm1
+ punpcklqdq xmm1, i12
+ punpckhqdq xmm12, i12
+
+ movdqa xmm13, xmm8
+ punpcklqdq xmm8, i13
+ punpckhqdq xmm13, i13
+
+ movdqa xmm14, xmm2
+ punpcklqdq xmm2, i14
+ punpckhqdq xmm14, i14
+
+ movdqa xmm15, xmm5
+ punpcklqdq xmm5, i15
+ punpckhqdq xmm15, i15
+
+ movdqa s0, xmm0
+ movdqa s1, xmm6
+ movdqa s2, xmm7
+ movdqa s3, xmm9
+ movdqa s4, xmm4
+ movdqa s5, xmm10
+ movdqa s6, xmm3
+ movdqa s7, xmm11
+ movdqa s8, xmm1
+ movdqa s9, xmm12
+ movdqa s10, xmm8
+ movdqa s11, xmm13
+ movdqa s12, xmm2
+ movdqa s13, xmm14
+ movdqa s14, xmm5
+ movdqa s15, xmm15
+
+ ; free stack space
+ add rsp, stack_size
+
+ ; un-ALIGN_STACK
+ pop rsp
+
+%ifidn __OUTPUT_FORMAT__,x64
+ pop r13
+ pop r12
+ RESTORE_XMM
+ pop rbp
+%endif
+
+ ret
+
+SECTION_RODATA
+align 16
+te0:
+ times 16 db 0xe0 ; sign-fill bits OR-ed into negative bytes after the >> 3 in LF_FILTER
+align 16
+t7f:
+ times 16 db 0x7f ; keeps the low 7 bits of each byte after psrlw 1 in LF_FILTER
+align 16
+tfe:
+ times 16 db 0xfe ; clears bit 0 of each byte before psrlw 1 in LF_FILTER_HEV_MASK
+align 16
+t1f:
+ times 16 db 0x1f ; keeps the low 5 bits of each byte after psrlw 3 in LF_FILTER
+align 16
+t80:
+ times 16 db 0x80 ; sign-bit flip (unsigned <-> signed bytes); also the sign fill for the >> 1
+align 16
+t1:
+ times 16 db 0x01 ; added to Filter1 before the >> 1 of the outer tap adjustment
+align 16
+t3:
+ times 16 db 0x03 ; Filter2 offset
+align 16
+t4:
+ times 16 db 0x04 ; Filter1 offset
diff --git a/libvpx/vp8/common/x86/loopfilter_mmx.asm b/libvpx/vp8/common/x86/loopfilter_mmx.asm
new file mode 100644
index 0000000..f388d24
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_mmx.asm
@@ -0,0 +1,1753 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_loop_filter_horizontal_edge_mmx - filters p1, p0, q0, q1 across a horizontal edge, 8 pixels per loop iteration
+;(
+; unsigned char *src_ptr, ; address of the q0 row; p0..p3 are the rows above it, q1..q3 the rows below it
+; int src_pixel_step, ; byte distance between consecutive rows
+; const char *blimit, ; 8 bytes: bound on abs(p0-q0)*2 + abs(p1-q1)/2
+; const char *limit, ; 8 bytes: bound on each neighbouring-row difference
+; const char *thresh, ; 8 bytes: high edge variance threshold
+; int count ; number of 8-pixel-wide groups to process (src_ptr advances by 8 per group)
+;)
+global sym(vp8_loop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (row pitch)
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_h:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx] ; mm7 = limit
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7 ; nonzero where abs(q3-q2) > limit
+
+
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+
+ psubusb mm4, mm7 ; nonzero where abs(q2-q1) > limit
+ por mm1, mm4 ; accumulate into mask
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7 ; nonzero where abs(q0-q1) > limit
+ por mm1, mm4 ; accumulate into mask
+
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7 ; nonzero where abs(p3 - p2) > limit
+ por mm1, mm4 ; accumulate into mask
+
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7 ; nonzero where abs(p2 - p1) > limit
+ por mm1, mm4 ; accumulate into mask
+
+ movq mm2, mm3 ; p1
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7 ; nonzero where abs(p1 - p0) > limit
+ por mm1, mm4 ; accumulate into mask
+
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, [rsi] ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5 ; mm1 is nonzero wherever any bound was exceeded
+ pxor mm5, mm5 ; mm5 = 0
+ pcmpeqb mm1, mm5 ; mask mm1: 0xff where every test passed, 0 elsewhere
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ; mm7 = thresh
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7 ; nonzero where abs(q1 - q0) > thresh
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7 ; nonzero where abs(p1 - p0) > thresh
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh (por: a byte add could wrap to 0, e.g. 0x80 + 0x80)
+
+ pcmpeqb mm4, mm5 ; mm5 is still 0: 0xff where neither difference exceeds thresh
+
+ pcmpeqb mm5, mm5 ; mm5 = all ones
+ pxor mm4, mm5 ; invert: mm4 = hev mask
+
+
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+ pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ pxor mm0, mm0 ; 0
+ pxor mm5, mm5 ; 0
+ punpcklbw mm0, mm2 ; low 4 bytes moved into the high byte of each word
+ punpckhbw mm5, mm2 ; high 4 bytes moved into the high byte of each word
+ psraw mm0, 11 ; 8 to drop the zero low byte + 3: sign extended shift right by 3
+ psraw mm5, 11
+ packsswb mm0, mm5
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)] ; add the `ones` constant (defined outside this view; presumably words of 1 for rounding)
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ pandn mm4, mm5 ; high edge variance additive: mm4 = ~hev & mm5
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back
+
+ movq mm6, [rsi+2*rax] ; p1
+ pxor mm6, [GLOBAL(t80)] ; reoffset
+ paddsb mm6, mm4 ; p1+= p1 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+2*rax], mm6 ; write back
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ movq [rdi], mm7 ; write back
+
+ add rsi,8 ; next 8 pixels along the edge
+ neg rax ; restore positive pitch for the next iteration
+ dec rcx
+ jnz .next8_h
+
+ add rsp, 32 ; release t0/t1
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_vertical_edge_mmx - transposes an 8x8 byte block around a vertical edge, filters p1, p0, q0, q1 and writes those 4 columns back
+;(
+; unsigned char *src_ptr, ; the code starts reading at src_ptr + 4*src_pixel_step - 4
+; int src_pixel_step, ; byte distance between consecutive rows
+; const char *blimit, ; 8 bytes: bound on abs(p0-q0)*2 + abs(p1-q1)/2
+; const char *limit, ; 8 bytes: bound on each neighbouring-column difference
+; const char *thresh, ; 8 bytes: high edge variance threshold
+; int count ; number of 8-row groups to process (rsi advances by 8 rows per group)
+;)
+global sym(vp8_loop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 64 ; reserve 64 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (row pitch)
+
+ lea rsi, [rsi + rax*4 - 4] ; rsi = row 4 of the group, 4 bytes left of src_ptr's column
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_v:
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+
+ ;transpose (byte maps below are "row column", listed from most to least significant byte)
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+ movq mm7, mm6 ; 67 66 65 64 63 62 61 60
+
+ punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
+ punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax ; negative pitch: address rows 0..3 above rsi
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, [rsi+rax] ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+ psubusb mm5, mm7 ; q2-q3
+
+ psubusb mm7, mm6 ; q3-q2
+ por mm7, mm5; ; mm7=abs (q3-q2)
+
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+
+ psubusb mm3, mm6 ; q1-q2
+ psubusb mm6, mm5 ; q2-q1
+
+ por mm6, mm3 ; mm6=abs(q2-q1)
+ lea rdx, srct ; rdx -> scratch copy of the transposed columns
+
+ movq [rdx+24], mm5 ; save q1
+ movq [rdx+16], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+8], mm3 ; save p0
+
+ movq [rdx], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4 ; nonzero where abs(q3-q2) > limit
+
+ psubusb mm0, mm4 ; nonzero where abs(p3-p2) > limit
+ psubusb mm1, mm4 ; nonzero where abs(p2-p1) > limit
+
+ psubusb mm6, mm4 ; nonzero where abs(q2-q1) > limit
+ por mm7, mm6
+
+ por mm0, mm1
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4 ; nonzero where abs(p1-p0) > limit
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+16] ; mm5=q0
+ movq mm7, [rdx+24] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4 ; nonzero where abs(q1-q0) > limit
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0 ; mm1 = 0xff where every test passed, 0 elsewhere
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (p1 - p0) (t0 was stored from abs(p1-p0) in this routine)
+ psubusb mm4, mm7
+
+ movq mm3, t1 ; get abs (q1 - q0) (t1 was stored from abs(q1-q0) in this routine)
+ psubusb mm3, mm7
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0 ; mm0 is still 0: 0xff where neither difference exceeds thresh
+
+ pcmpeqb mm0, mm0 ; mm0 = all ones
+ pxor mm4, mm0 ; invert: mm4 = hev mask
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ movq mm2, [rdx] ; p1
+ movq mm7, [rdx+24] ; q1
+
+ movq mm6, [rdx+8] ; p0
+ movq mm0, [rdx+16] ; q0
+
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm2, mm7 ; p1 - q1
+ pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
+
+ pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ movq mm2, mm1
+ paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+
+ paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+ pxor mm0, mm0 ; 0
+
+ pxor mm5, mm5
+ punpcklbw mm0, mm2 ; low 4 bytes moved into the high byte of each word
+
+ punpckhbw mm5, mm2 ; high 4 bytes moved into the high byte of each word
+ psraw mm0, 11 ; 8 to drop the zero low byte + 3: sign extended shift right by 3
+
+ psraw mm5, 11
+ packsswb mm0, mm5
+
+ movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+
+ pxor mm0, mm0 ; 0
+ movq mm5, mm1 ; abcdefgh
+
+ punpcklbw mm0, mm1 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+
+ pxor mm1, mm1 ; 0
+ punpckhbw mm1, mm5 ; a0b0c0d0
+
+ psraw mm1, 11 ; sign extended shift right by 3
+ movq mm5, mm0 ; save results
+
+ packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+ paddsw mm5, [GLOBAL(ones)]
+
+ paddsw mm1, [GLOBAL(ones)]
+ psraw mm5, 1 ; partial shifted one more time for 2nd tap
+
+ psraw mm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+
+ pandn mm4, mm5 ; high edge variance additive: mm4 = ~hev & mm5
+
+ paddsb mm6, mm2 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+ ; mm6=p0 ;
+ movq mm1, [rdx] ; p1
+ pxor mm1, [GLOBAL(t80)] ; reoffset
+
+ paddsb mm1, mm4 ; p1+= p1 add
+ pxor mm1, [GLOBAL(t80)] ; unoffset
+ ; mm6 = p0 mm1 = p1
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; mm3 = q0
+ psubsb mm7, mm4 ; q1-= q1 add
+ pxor mm7, [GLOBAL(t80)] ; unoffset
+ ; mm7 = q1
+
+ ; transpose and write back
+ ; mm1 = 72 62 52 42 32 22 12 02
+ ; mm6 = 73 63 53 43 33 23 13 03
+ ; mm3 = 74 64 54 44 34 24 14 04
+ ; mm7 = 75 65 55 45 35 25 15 05
+
+ movq mm2, mm1 ; 72 62 52 42 32 22 12 02
+ punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
+
+ movq mm4, mm3 ; 74 64 54 44 34 24 14 04
+ punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
+
+ punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
+ punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
+
+ movq mm6, mm2 ; 33 32 23 22 13 12 03 02
+ punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
+
+ punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
+ movq mm5, mm1 ; 73 72 63 62 53 52 43 42
+
+ punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
+ punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
+
+
+ ; mm2 = 15 14 13 12 05 04 03 02
+ ; mm6 = 35 34 33 32 25 24 23 22
+ ; mm1 = 55 54 53 52 45 44 43 42
+ ; mm5 = 75 74 73 72 65 64 63 62
+
+
+
+ movd [rsi+rax*4+2], mm2 ; row 0, bytes 2..5 (rax is still the negated pitch)
+ psrlq mm2, 32
+
+ movd [rdi+rax*4+2], mm2 ; row 1
+ movd [rsi+rax*2+2], mm6 ; row 2
+
+ psrlq mm6, 32
+ movd [rsi+rax+2],mm6 ; row 3
+
+ movd [rsi+2], mm1 ; row 4
+ psrlq mm1, 32
+
+ movd [rdi+2], mm1 ; row 5
+ neg rax ; back to positive pitch
+
+ movd [rdi+rax+2],mm5 ; row 6
+ psrlq mm5, 32
+
+ movd [rdi+rax*2+2], mm5 ; row 7
+
+ lea rsi, [rsi+rax*8] ; advance 8 rows
+ dec rcx
+ jnz .next8_v
+
+ add rsp, 64 ; release t0/t1/srct
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_horizontal_edge_mmx - filters p2, p1, p0, q0, q1, q2 across a horizontal edge, 8 pixels per loop iteration
+;(
+; unsigned char *src_ptr, ; address of the q0 row; p0..p3 are the rows above it, q1..q3 the rows below it
+; int src_pixel_step, ; byte distance between consecutive rows
+; const char *blimit, ; 8 bytes: bound on abs(p0-q0)*2 + abs(p1-q1)/2
+; const char *limit, ; 8 bytes: bound on each neighbouring-row difference
+; const char *thresh, ; 8 bytes: high edge variance threshold
+; int count ; number of 8-pixel-wide groups to process (src_ptr advances by 8 per group)
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (row pitch)
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbh:
+ mov rdx, arg(3) ;limit
+ movq mm7, [rdx] ; mm7 = limit
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax
+
+ ; calculate breakout conditions
+ movq mm2, [rdi+2*rax] ; q3
+
+ movq mm1, [rsi+2*rax] ; q2
+ movq mm6, mm1 ; q2
+ psubusb mm1, mm2 ; q2-=q3
+ psubusb mm2, mm6 ; q3-=q2
+ por mm1, mm2 ; abs(q3-q2)
+ psubusb mm1, mm7 ; nonzero where abs(q3-q2) > limit
+
+
+ ; mm1 = mask so far, mm6 =q2, mm7 = limit
+ movq mm4, [rsi+rax] ; q1
+ movq mm3, mm4 ; q1
+ psubusb mm4, mm6 ; q1-=q2
+ psubusb mm6, mm3 ; q2-=q1
+ por mm4, mm6 ; abs(q2-q1)
+ psubusb mm4, mm7 ; nonzero where abs(q2-q1) > limit
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm3=q1, mm7 = limit
+
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ psubusb mm4, mm3 ; q0-=q1
+ psubusb mm3, mm0 ; q1-=q0
+ por mm4, mm3 ; abs(q0-q1)
+ movq t0, mm4 ; save to t0
+ psubusb mm4, mm7 ; nonzero where abs(q0-q1) > limit
+ por mm1, mm4
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ neg rax ; negate pitch to deal with above border
+
+ movq mm2, [rsi+4*rax] ; p3
+ movq mm4, [rdi+4*rax] ; p2
+ movq mm5, mm4 ; p2
+ psubusb mm4, mm2 ; p2-=p3
+ psubusb mm2, mm5 ; p3-=p2
+ por mm4, mm2 ; abs(p3 - p2)
+ psubusb mm4, mm7 ; nonzero where abs(p3 - p2) > limit
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+2*rax] ; p1
+ movq mm3, mm4 ; p1
+ psubusb mm4, mm5 ; p1-=p2
+ psubusb mm5, mm3 ; p2-=p1
+ por mm4, mm5 ; abs(p2 - p1)
+ psubusb mm4, mm7 ; nonzero where abs(p2 - p1) > limit
+ por mm1, mm4
+
+ movq mm2, mm3 ; p1
+
+
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
+
+ movq mm4, [rsi+rax] ; p0
+ movq mm5, mm4 ; p0
+ psubusb mm4, mm3 ; p0-=p1
+ psubusb mm3, mm5 ; p1-=p0
+ por mm4, mm3 ; abs(p1 - p0)
+ movq t1, mm4 ; save to t1
+ psubusb mm4, mm7 ; nonzero where abs(p1 - p0) > limit
+ por mm1, mm4
+ ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm5 = p0
+ movq mm3, [rdi] ; q1
+ movq mm4, mm3 ; q1
+ psubusb mm3, mm2 ; q1-=p1
+ psubusb mm2, mm4 ; p1-=q1
+ por mm2, mm3 ; abs(p1-q1)
+ pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm2, 1 ; abs(p1-q1)/2
+
+ movq mm6, mm5 ; p0
+ movq mm3, mm0 ; q0
+ psubusb mm5, mm3 ; p0-=q0
+ psubusb mm3, mm6 ; q0-=p0
+ por mm5, mm3 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; blimit
+
+ psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm5 ; mm1 is nonzero wherever any bound was exceeded
+ pxor mm5, mm5 ; mm5 = 0
+ pcmpeqb mm1, mm5 ; mask mm1: 0xff where every test passed, 0 elsewhere
+
+ ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0,
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx] ; mm7 = thresh
+ movq mm4, t0 ; get abs (q1 - q0)
+ psubusb mm4, mm7 ; nonzero where abs(q1 - q0) > thresh
+ movq mm3, t1 ; get abs (p1 - p0)
+ psubusb mm3, mm7 ; nonzero where abs(p1 - p0) > thresh
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh (por: a byte add could wrap to 0, e.g. 0x80 + 0x80)
+
+ pcmpeqb mm4, mm5 ; mm5 is still 0: 0xff where neither difference exceeds thresh
+
+ pcmpeqb mm5, mm5 ; mm5 = all ones
+ pxor mm4, mm5 ; invert: mm4 = hev mask
+
+
+
+ ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
+ ; mm6 = p0, mm4=hev
+ ; start work on filters
+ movq mm2, [rsi+2*rax] ; p1
+ movq mm7, [rdi] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ; Filter2
+ paddsb mm5, [GLOBAL(t3)]; ; vp8_signed_char_clamp(Filter2 + 3)
+
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; (Filter2 + 3) >> 3
+
+ movq mm5, mm0 ; mm5 = (Filter2 + 3) >> 3, added to ps0 below
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter1 = (Filter2 + 4) >> 3
+
+ ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = (Filter2 + 3) >> 3, mm4 = hev, mm6 = ps0
+ psubsb mm3, mm0 ; qs0 =qs0 - filter1
+ paddsb mm6, mm5 ; ps0 =ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor mm0, mm0 ; mm0 = 0 (not read again in this iteration)
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4 ; low 4 filter bytes moved into the high byte of each word
+ punpckhbw mm2, mm4 ; high 4 filter bytes moved into the high byte of each word
+ pmulhw mm1, [GLOBAL(s27)] ; s27 is defined outside this view; presumably yields Filter2 * 27 per word
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)] ; presumably + 63 (s63 is defined outside this view)
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2 ; u, saturated to signed bytes
+
+ psubsb mm3, mm1 ; qs0 - u
+ paddsb mm6, mm1 ; ps0 + u
+
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back p0
+ movq [rsi], mm3 ; write back q0
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2 ; u, saturated to signed bytes
+
+ movq mm3, [rdi] ; q1
+ movq mm6, [rsi+rax*2] ; p1
+
+ pxor mm3, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ paddsb mm6, mm1 ; ps1 + u
+ psubsb mm3, mm1 ; qs1 - u
+
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rdi], mm3 ; write back q1
+ movq [rsi+rax*2], mm6 ; write back p1
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2 ; u, saturated to signed bytes
+
+
+ movq mm6, [rdi+rax*4] ; p2 (rax is the negated pitch here)
+ neg rax ; positive pitch
+ movq mm3, [rdi+rax ] ; q2
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; offset to convert to signed values
+
+ paddsb mm6, mm1 ; ps2 + u
+ psubsb mm3, mm1 ; qs2 - u
+
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rdi+rax ], mm3 ; write back q2
+ neg rax ; negated pitch
+ movq [rdi+rax*4], mm6 ; write back p2
+
+;EARLY_BREAK_OUT:
+ neg rax ; restore positive pitch for the next iteration
+ add rsi,8 ; next 8 pixels along the edge
+ dec rcx
+ jnz .next8_mbh
+
+ add rsp, 32 ; release t0/t1
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_mmx - transposes an 8x8 byte block around a vertical edge, filters p2..q2 and writes all 8 columns back
+;(
+; unsigned char *src_ptr, ; the code starts reading at src_ptr + 4*src_pixel_step - 4
+; int src_pixel_step, ; byte distance between consecutive rows
+; const char *blimit, ; 8 bytes: bound on abs(p0-q0)*2 + abs(p1-q1)/2
+; const char *limit, ; 8 bytes: bound on each neighbouring-column difference
+; const char *thresh, ; 8 bytes: high edge variance threshold
+; int count ; number of 8-row groups to process (rsi advances by 8 rows per group)
+;)
+global sym(vp8_mbloop_filter_vertical_edge_mmx) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 96 ; reserve 96 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
+ %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step (row pitch)
+
+ lea rsi, [rsi + rax*4 - 4] ; rsi = row 4 of the group, 4 bytes left of src_ptr's column
+
+ movsxd rcx, dword ptr arg(5) ;count
+.next8_mbv:
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ;transpose (byte maps below are "row column", listed from most to least significant byte)
+ movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
+ movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
+
+ movq mm7, mm6 ; 67 66 65 64 63 62 61 60
+ punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
+
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+ movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
+
+ movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
+ movq mm5, mm4 ; 47 46 45 44 43 42 41 40
+
+ punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+
+ movq mm3, mm5 ; 57 47 56 46 55 45 54 44
+ punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
+
+ punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
+ movq mm2, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
+ punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
+
+ neg rax ; negative pitch: address rows 0..3 above rsi
+
+ movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
+ movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
+
+ movq mm1, mm6 ; 27 26 25 24 23 22 21 20
+ punpckhbw mm6, mm7 ; 37 27 36 26 35 25 34 24
+
+ punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
+
+ movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
+ punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
+
+ movq mm0, mm7 ; 17 07 16 06 15 05 14 04
+ punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
+
+ punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
+ movq mm6, mm7 ; 37 27 17 07 36 26 16 06
+
+ punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
+ punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
+
+ lea rdx, srct ; rdx -> scratch copy of the transposed columns
+ movq mm5, mm6 ; 76 66 56 46 36 26 16 06
+
+ movq [rdx+56], mm7 ; save q3
+ psubusb mm5, mm7 ; q2-q3
+
+
+ movq [rdx+48], mm6 ; save q2
+ psubusb mm7, mm6 ; q3-q2
+
+ por mm7, mm5; ; mm7=abs (q3-q2)
+ movq mm5, mm0 ; 35 25 15 05 34 24 14 04
+
+ punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
+ punpckldq mm0, mm3 ; 74 64 54 44 34 24 14 04 = q0
+
+ movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
+ psubusb mm3, mm6 ; q1-q2
+
+ psubusb mm6, mm5 ; q2-q1
+ por mm6, mm3 ; mm6=abs(q2-q1)
+
+ movq [rdx+40], mm5 ; save q1
+ movq [rdx+32], mm0 ; save q0
+
+ movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
+ punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
+
+ movq mm0, mm3 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
+ punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
+
+ movq [rdx], mm0 ; save p3
+ movq [rdx+8], mm1 ; save p2
+
+ movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
+ psubusb mm2, mm0 ; p2-p3
+
+ psubusb mm0, mm1 ; p3-p2
+ por mm0, mm2 ; mm0=abs(p3-p2)
+
+ movq mm2, mm3 ; 33 23 13 03 32 22 12 02
+ punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
+
+ punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
+ movq [rdx+24], mm3 ; save p0
+
+ movq [rdx+16], mm2 ; save p1
+ movq mm5, mm2 ; mm5 = p1
+
+ psubusb mm2, mm1 ; p1-p2
+ psubusb mm1, mm5 ; p2-p1
+
+ por mm1, mm2 ; mm1=abs(p2-p1)
+ mov rdx, arg(3) ;limit
+
+ movq mm4, [rdx] ; mm4 = limit
+ psubusb mm7, mm4 ; abs(q3-q2) > limit
+
+ psubusb mm0, mm4 ; abs(p3-p2) > limit
+ psubusb mm1, mm4 ; abs(p2-p1) > limit
+
+ psubusb mm6, mm4 ; abs(q2-q1) > limit
+ por mm7, mm6 ; or
+
+ por mm0, mm1 ;
+ por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
+
+ movq mm1, mm5 ; p1
+
+ movq mm7, mm3 ; mm3=mm7=p0
+ psubusb mm7, mm5 ; p0 - p1
+
+ psubusb mm5, mm3 ; p1 - p0
+ por mm5, mm7 ; abs(p1-p0)
+
+ movq t0, mm5 ; save abs(p1-p0)
+ lea rdx, srct
+
+ psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
+ por mm0, mm5 ; mm0=mask
+
+ movq mm5, [rdx+32] ; mm5=q0
+ movq mm7, [rdx+40] ; mm7=q1
+
+ movq mm6, mm5 ; mm6=q0
+ movq mm2, mm7 ; q1
+ psubusb mm5, mm7 ; q0-q1
+
+ psubusb mm7, mm6 ; q1-q0
+ por mm7, mm5 ; abs(q1-q0)
+
+ movq t1, mm7 ; save abs(q1-q0)
+ psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
+
+ por mm0, mm7 ; mask
+
+ movq mm5, mm2 ; q1
+ psubusb mm5, mm1 ; q1-=p1
+ psubusb mm1, mm2 ; p1-=q1
+ por mm5, mm1 ; abs(p1-q1)
+ pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw mm5, 1 ; abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ;
+
+ movq mm4, [rdx] ;blimit
+ movq mm1, mm3 ; mm1=mm3=p0
+
+ movq mm7, mm6 ; mm7=mm6=q0
+ psubusb mm1, mm7 ; p0-q0
+
+ psubusb mm7, mm3 ; q0-p0
+ por mm1, mm7 ; abs(q0-p0)
+ paddusb mm1, mm1 ; abs(q0-p0)*2
+ paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por mm1, mm0; ; mask
+
+ pxor mm0, mm0
+ pcmpeqb mm1, mm0 ; mm1 = 0xff where every test passed, 0 elsewhere
+
+ ; calculate high edge variance
+ mov rdx, arg(4) ;thresh ; get thresh
+ movq mm7, [rdx]
+ ;
+ movq mm4, t0 ; get abs (p1 - p0) (t0 was stored from abs(p1-p0) in this routine)
+ psubusb mm4, mm7 ; abs(p1 - p0) > thresh
+
+ movq mm3, t1 ; get abs (q1 - q0) (t1 was stored from abs(q1-q0) in this routine)
+ psubusb mm3, mm7 ; abs(q1 - q0)> thresh
+
+ por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ pcmpeqb mm4, mm0 ; mm0 is still 0: 0xff where neither difference exceeds thresh
+
+ pcmpeqb mm0, mm0 ; mm0 = all ones
+ pxor mm4, mm0 ; invert: mm4 = hev mask
+
+
+
+
+ ; start work on filters
+ lea rdx, srct
+
+ ; start work on filters
+ movq mm2, [rdx+16] ; p1
+ movq mm7, [rdx+40] ; q1
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ movq mm6, [rdx+24] ; p0
+ movq mm0, [rdx+32] ; q0
+ pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
+
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 2 * (q0 - p0) + (p1 - q1)
+ paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand mm1, mm2 ; mask filter values we don't care about
+
+ ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
+ movq mm2, mm1 ; vp8_filter
+ pand mm2, mm4; ; Filter2 = vp8_filter & hev
+
+ movq mm5, mm2 ; Filter2
+ paddsb mm5, [GLOBAL(t3)]; ; vp8_signed_char_clamp(Filter2 + 3)
+
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm5 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm5 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; (Filter2 + 3) >> 3
+
+ movq mm5, mm0 ; mm5 = (Filter2 + 3) >> 3, added to ps0 below
+
+ paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+ pxor mm0, mm0 ; 0
+ pxor mm7, mm7 ; 0
+
+ punpcklbw mm0, mm2 ; e0f0g0h0
+ psraw mm0, 11 ; sign extended shift right by 3
+ punpckhbw mm7, mm2 ; a0b0c0d0
+ psraw mm7, 11 ; sign extended shift right by 3
+ packsswb mm0, mm7 ; Filter1 = (Filter2 + 4) >> 3
+
+ ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = (Filter2 + 3) >> 3, mm4 = hev, mm6 = ps0
+ psubsb mm3, mm0 ; qs0 =qs0 - filter1
+ paddsb mm6, mm5 ; ps0 =ps0 + Filter2
+
+ ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
+ ; vp8_filter &= ~hev;
+ ; Filter2 = vp8_filter;
+ pandn mm4, mm1 ; vp8_filter&=~hev
+
+
+ ; mm3=qs0, mm4=filter2, mm6=ps0
+
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
+ ; s = vp8_signed_char_clamp(qs0 - u);
+ ; *oq0 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps0 + u);
+ ; *op0 = s^0x80;
+ pxor mm0, mm0 ; mm0 = 0 (overwritten before it is read again)
+
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4 ; low 4 filter bytes moved into the high byte of each word
+ punpckhbw mm2, mm4 ; high 4 filter bytes moved into the high byte of each word
+ pmulhw mm1, [GLOBAL(s27)] ; s27 is defined outside this view; presumably yields Filter2 * 27 per word
+ pmulhw mm2, [GLOBAL(s27)]
+ paddw mm1, [GLOBAL(s63)] ; presumably + 63 (s63 is defined outside this view)
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2 ; u, saturated to signed bytes
+
+ psubsb mm3, mm1 ; qs0 - u
+ paddsb mm6, mm1 ; ps0 + u
+
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rdx+24], mm6 ; store filtered p0 in srct
+ movq [rdx+32], mm3 ; store filtered q0 in srct
+
+ ; roughly 2/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
+ ; s = vp8_signed_char_clamp(qs1 - u);
+ ; *oq1 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps1 + u);
+ ; *op1 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s18)]
+ pmulhw mm2, [GLOBAL(s18)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2 ; u, saturated to signed bytes
+
+ movq mm3, [rdx + 40] ; q1
+ movq mm6, [rdx + 16] ; p1
+ pxor mm3, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+
+ paddsb mm6, mm1 ; ps1 + u
+ psubsb mm3, mm1 ; qs1 - u
+
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rdx + 40], mm3 ; store filtered q1 in srct
+ movq [rdx + 16], mm6 ; store filtered p1 in srct
+
+ ; roughly 1/7th difference across boundary
+ ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
+ ; s = vp8_signed_char_clamp(qs2 - u);
+ ; *oq2 = s^0x80;
+ ; s = vp8_signed_char_clamp(ps2 + u);
+ ; *op2 = s^0x80;
+ pxor mm1, mm1
+ pxor mm2, mm2
+ punpcklbw mm1, mm4
+ punpckhbw mm2, mm4
+ pmulhw mm1, [GLOBAL(s9)]
+ pmulhw mm2, [GLOBAL(s9)]
+ paddw mm1, [GLOBAL(s63)]
+ paddw mm2, [GLOBAL(s63)]
+ psraw mm1, 7
+ psraw mm2, 7
+ packsswb mm1, mm2 ; u, saturated to signed bytes
+
+ movq mm6, [rdx+ 8] ; p2
+ movq mm3, [rdx+48] ; q2
+
+ pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; offset to convert to signed values
+
+ paddsb mm6, mm1 ; ps2 + u
+ psubsb mm3, mm1 ; qs2 - u
+
+ pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
+ pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 16 06
+
+ ; transpose and write back
+ movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
+ movq mm1, mm0 ; mm1 = 70 60 50 40 30 20 10 00
+
+ punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
+ punpckhbw mm1, mm6 ; mm1 = 71 70 61 60 51 50 41 40
+
+ movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
+ movq mm6, mm2 ; mm6 = 72 62 52 42 32 22 12 02
+
+ punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
+ punpckhbw mm6, [rdx+24] ; mm6 = 73 72 63 62 53 52 43 42
+
+ movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
+ punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
+
+ punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
+ movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
+
+ punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
+ punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
+
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+ punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
+
+ movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 16 06
+ punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
+
+ movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
+ punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
+
+ punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
+ movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
+
+ punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
+ punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
+
+ movq [rsi+rax*4], mm0 ; write out row 0 (rax is still the negated pitch)
+ movq [rdi+rax*4], mm6 ; write out row 1
+
+ movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
+ punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 21 20
+
+ punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
+ movq [rsi+rax*2], mm0 ; write out row 2
+
+ movq [rdi+rax*2], mm5 ; write out row 3
+ movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
+
+ punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 55 54 45 44
+ punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
+
+ movq mm5, mm2 ; mm5 = 75 74 65 64 55 54 45 44
+ punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
+
+ punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
+ movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
+
+ movq mm3, mm4 ; mm3 = mm4 = 73 72 71 70 63 62 61 60
+ punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
+
+ punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
+ movq [rsi], mm0 ; write out row 4
+
+ movq [rdi], mm1 ; write out row 5
+ neg rax ; back to positive pitch
+
+ punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
+ punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 70
+
+ movq [rsi+rax*2], mm3 ; write out row 6
+ movq [rdi+rax*2], mm4 ; write out row 7
+
+ lea rsi, [rsi+rax*8] ; advance 8 rows
+ dec rcx
+
+ jnz .next8_mbv
+
+ add rsp, 96 ; release t0/t1/srct
+ pop rsp ; undo ALIGN_STACK
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Simple loop filter across a horizontal edge, 16 pixels wide (two passes of 8 bytes); only the p0 and q0 rows are rewritten.
+;void vp8_loop_filter_simple_horizontal_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_horizontal_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr (the q0 row, i.e. first row below the edge)
+ movsxd rax, dword ptr arg(1) ;src_pixel_step: row pitch in bytes, sign-extended
+
+ mov rcx, 2 ; count: two groups of 8 pixels
+.nexts8_h:
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm3, [rdx] ; mm3 = 8 bytes of blimit
+
+ mov rdi, rsi ; rdi points to row +1 for indirect addressing
+ add rdi, rax ; rdi = q1 row
+ neg rax ; rax = -pitch, to address the rows above the edge
+
+ ; calculate mask
+ movq mm1, [rsi+2*rax] ; p1
+ movq mm0, [rdi] ; q1
+ movq mm2, mm1 ; keep p1 for the filter stage
+ movq mm7, mm0 ; keep q1 for the filter stage
+ movq mm4, mm0 ; q1 copy for the absolute difference
+ psubusb mm0, mm1 ; q1-=p1
+ psubusb mm1, mm4 ; p1-=q1
+ por mm1, mm0 ; abs(p1-q1)
+ pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero so the word shift below cannot leak into the neighbouring byte
+ psrlw mm1, 1 ; abs(p1-q1)/2
+
+ movq mm5, [rsi+rax] ; p0
+ movq mm4, [rsi] ; q0
+ movq mm0, mm4 ; q0
+ movq mm6, mm5 ; p0
+ psubusb mm5, mm4 ; p0-=q0
+ psubusb mm4, mm6 ; q0-=p0
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ psubusb mm5, mm3 ; nonzero where abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm3, mm3 ; mm3 = 0
+ pcmpeqb mm5, mm3 ; mm5 = mask: 0xff in lanes that are to be filtered
+
+ ; start work on filters
+ pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ psubsb mm2, mm7 ; p1 - q1
+
+ pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
+ pxor mm0, [GLOBAL(t80)] ; q0 offset to convert to signed values
+ movq mm3, mm0 ; q0
+ psubsb mm0, mm6 ; q0 - p0
+ paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand mm5, mm2 ; mask filter values we don't care about
+
+ ; do + 4 side
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8: low byte of each word moves to the high byte
+ psraw mm0, 3 ; arithmetic shift right 3: signed >>3 of that byte
+ psrlw mm0, 8 ; logical shift right 8: result back in the low byte
+ movq mm1, mm5 ; get a copy of filters
+ psraw mm1, 11 ; arithmetic shift right 11: signed >>3 of the high byte, left in the low byte
+ psllw mm1, 8 ; shift left 8 to put it back
+
+ por mm0, mm1 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0-= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+ movq [rsi], mm3 ; write back
+
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8: low byte of each word moves to the high byte
+ psraw mm0, 3 ; arithmetic shift right 3: signed >>3 of that byte
+ psrlw mm0, 8 ; logical shift right 8: result back in the low byte
+ psraw mm5, 11 ; arithmetic shift right 11: signed >>3 of the high byte
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+ movq [rsi+rax], mm6 ; write back (rax is -pitch here, so this is the p0 row)
+
+ add rsi,8 ; advance to the next 8 pixels of the edge
+ neg rax ; restore +pitch for the next pass
+ dec rcx
+ jnz .nexts8_h
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Simple loop filter across a vertical edge, 16 rows (two passes of 8): transposes the 4 bytes straddling the edge, filters p0/q0, transposes back and stores.
+;void vp8_loop_filter_simple_vertical_edge_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+global sym(vp8_loop_filter_simple_vertical_edge_mmx) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; holds the unfiltered p1 column
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; holds the unfiltered q1 column
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step: row pitch in bytes, sign-extended
+
+ lea rsi, [rsi + rax*4- 2]; rsi -> row 4, two bytes left of the edge
+ mov rcx, 2 ; count: two groups of 8 rows
+.nexts8_v:
+
+ lea rdi, [rsi + rax]; rdi -> row 5
+ movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
+
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
+ punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
+
+ movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
+ movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
+
+ punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
+ movq mm5, mm4 ; 53 43 52 42 51 41 50 40
+
+ punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
+ punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
+
+ neg rax ; rax = -pitch, to reach rows 0..3
+
+ movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
+ movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
+
+ punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
+ movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
+
+ movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
+ punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
+
+ movq mm2, mm0 ; 13 03 12 02 11 01 10 00
+ punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
+
+ punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
+ movq mm1, mm0 ; 31 21 11 01 30 20 10 00
+
+ punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
+ movq mm3, mm2 ; 33 23 13 03 32 22 12 02
+
+ punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
+ punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
+
+ punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
+
+
+ ; calculate mask
+ movq mm6, mm0 ; p1
+ movq mm7, mm3 ; q1
+ psubusb mm7, mm6 ; q1-=p1
+ psubusb mm6, mm3 ; p1-=q1
+ por mm6, mm7 ; abs(p1-q1)
+ pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero so the word shift below cannot leak into the neighbouring byte
+ psrlw mm6, 1 ; abs(p1-q1)/2
+
+ movq mm5, mm1 ; p0
+ movq mm4, mm2 ; q0
+
+ psubusb mm5, mm2 ; p0-=q0
+ psubusb mm4, mm1 ; q0-=p0
+
+ por mm5, mm4 ; abs(p0 - q0)
+ paddusb mm5, mm5 ; abs(p0-q0)*2
+ paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ mov rdx, arg(2) ;blimit ; get blimit
+ movq mm7, [rdx] ; mm7 = 8 bytes of blimit
+
+ psubusb mm5, mm7 ; nonzero where abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor mm7, mm7 ; mm7 = 0
+ pcmpeqb mm5, mm7 ; mm5 = mask
+
+ ; start work on filters
+ movq t0, mm0 ; save unfiltered p1 for the write-back transpose
+ movq t1, mm3 ; save unfiltered q1 for the write-back transpose
+
+ pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
+ pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
+
+ psubsb mm0, mm3 ; p1 - q1
+ movq mm6, mm1 ; p0
+
+ movq mm7, mm2 ; q0
+ pxor mm6, [GLOBAL(t80)] ; p0 offset to convert to signed values
+
+ pxor mm7, [GLOBAL(t80)] ; q0 offset to convert to signed values
+ movq mm3, mm7 ; signed (offset) q0
+
+ psubsb mm7, mm6 ; q0 - p0
+ paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
+
+ paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
+
+ pand mm5, mm0 ; mask filter values we don't care about
+
+ paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8: low byte of each word moves to the high byte
+ psraw mm0, 3 ; arithmetic shift right 3: signed >>3 of that byte
+ psrlw mm0, 8 ; logical shift right 8: result back in the low byte
+
+ movq mm7, mm5 ; get a copy of filters
+ psraw mm7, 11 ; arithmetic shift right 11: signed >>3 of the high byte, left in the low byte
+ psllw mm7, 8 ; shift left 8 to put it back
+
+ por mm0, mm7 ; put the two together to get result
+
+ psubsb mm3, mm0 ; q0 -= q0 add
+ pxor mm3, [GLOBAL(t80)] ; unoffset
+
+ ; now do +3 side
+ psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
+
+ movq mm0, mm5 ; get a copy of filters
+ psllw mm0, 8 ; shift left 8: low byte of each word moves to the high byte
+ psraw mm0, 3 ; arithmetic shift right 3: signed >>3 of that byte
+ psrlw mm0, 8 ; logical shift right 8: result back in the low byte
+
+ psraw mm5, 11 ; arithmetic shift right 11: signed >>3 of the high byte
+ psllw mm5, 8 ; shift left 8 to put it back
+ por mm0, mm5 ; put the two together to get result
+
+ paddsb mm6, mm0 ; p0+= p0 add
+ pxor mm6, [GLOBAL(t80)] ; unoffset
+
+
+ movq mm0, t0 ; reload unfiltered p1
+ movq mm4, t1 ; reload unfiltered q1
+
+ ; mm0 = 70 60 50 40 30 20 10 00 (p1, unchanged)
+ ; mm6 = 71 61 51 41 31 21 11 01 (filtered p0)
+ ; mm3 = 72 62 52 42 32 22 12 02 (filtered q0)
+ ; mm4 = 73 63 53 43 33 23 13 03 (q1, unchanged)
+ ; transpose back to write out
+
+ movq mm1, mm0 ; copy of p1 column for the high rows
+ punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
+
+ punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
+ movq mm2, mm3 ; copy of q0 column for the low rows
+
+ punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
+ movq mm5, mm1 ; 71 70 61 60 51 50 41 40
+
+ punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
+ movq mm6, mm0 ; 31 30 21 20 11 10 01 00
+
+ punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
+ punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
+
+ movd [rsi+rax*4], mm0 ; write 03 02 01 00
+ punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
+
+ psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
+ punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
+
+ movd [rdi+rax*4], mm0 ; write 13 12 11 10
+ movd [rsi+rax*2], mm6 ; write 23 22 21 20
+
+ psrlq mm6, 32 ; 33 32 31 30
+ movd [rsi], mm1 ; write 43 42 41 40
+
+ movd [rsi + rax], mm6 ; write 33 32 31 30
+ neg rax ; back to +pitch for rows 5..7
+
+ movd [rsi + rax*2], mm5 ; write 63 62 61 60
+ psrlq mm1, 32 ; 53 52 51 50
+
+ movd [rdi], mm1 ; write out 53 52 51 50
+ psrlq mm5, 32 ; 73 72 71 70
+
+ movd [rdi + rax*2], mm5 ; write 73 72 71 70
+
+ lea rsi, [rsi+rax*8] ; next 8
+
+ dec rcx
+ jnz .nexts8_v
+
+ add rsp, 32 ; release the t0/t1 scratch area
+ pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro not visible here) - confirm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
+; int y_stride,
+; loop_filter_info *lfi)
+;{
+;
+;
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
+;}
+
+SECTION_RODATA
+align 16
+tfe:
+ times 8 db 0xfe ; per-byte mask clearing bit 0, applied before a word-wide shift right by 1
+align 16
+t80:
+ times 8 db 0x80 ; per-byte sign bit: XOR converts between unsigned pixels and signed values
+align 16
+t1s:
+ times 8 db 0x01 ; per-byte 1: subtracted to turn the "+4" filter value into "+3"
+align 16
+t3:
+ times 8 db 0x03 ; per-byte 3 (rounding term; not referenced in the simple filters shown here)
+align 16
+t4:
+ times 8 db 0x04 ; per-byte 4: rounding term added before the >>3
+align 16
+ones:
+ times 4 dw 0x0001 ; per-word 1
+align 16
+s27:
+ times 4 dw 0x1b00 ; 27 << 8: with the byte in the high half of each word, pmulhw yields byte * 27
+align 16
+s18:
+ times 4 dw 0x1200 ; 18 << 8: same scheme, byte * 18
+align 16
+s9:
+ times 4 dw 0x0900 ; 9 << 8: same scheme, byte * 9
+align 16
+s63:
+ times 4 dw 0x003f ; per-word 63: rounding term added before the >>7
diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm
new file mode 100644
index 0000000..a66753b
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm
@@ -0,0 +1,1640 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0 ; byte offsets from rsp of consecutive 16-byte scratch slots
+%define _t1 _t0 + 16 ; second temporary
+%define _p3 _t1 + 16 ; slots for the eight pixel rows/columns p3..q3
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160 ; total scratch size: ten 16-byte slots (_t0 .. _q3)
+
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
+; LFH_FILTER_AND_HEV_MASK %1: %1=1 reads 16-byte rows via rsi/rdi/rax; %1=0 packs u (low 8 bytes) and v (high 8 bytes). In: rdx = limit. Out: xmm1 = filter mask, xmm4 = hev mask, xmm6 = p0, xmm0 = q0.
+%macro LFH_FILTER_AND_HEV_MASK 1
+%if %1
+ movdqa xmm2, [rdi+2*rax] ; q3
+ movdqa xmm1, [rsi+2*rax] ; q2
+ movdqa xmm4, [rsi+rax] ; q1
+ movdqa xmm5, [rsi] ; q0
+ neg rax ; negate pitch to deal with above border
+%else
+ movlps xmm2, [rsi + rcx*2] ; q3
+ movlps xmm1, [rsi + rcx] ; q2
+ movlps xmm4, [rsi] ; q1
+ movlps xmm5, [rsi + rax] ; q0
+
+ movhps xmm2, [rdi + rcx*2] ; q3, second plane in the high 8 bytes
+ movhps xmm1, [rdi + rcx] ; q2
+ movhps xmm4, [rdi] ; q1
+ movhps xmm5, [rdi + rax] ; q0
+
+ lea rsi, [rsi + rax*4] ; step both pointers four rows in the rax direction, onto the p2 row
+ lea rdi, [rdi + rax*4]
+
+ movdqa [rsp+_q2], xmm1 ; store q2
+ movdqa [rsp+_q1], xmm4 ; store q1
+%endif
+ movdqa xmm7, [rdx] ;limit
+
+ movdqa xmm6, xmm1 ; q2
+ movdqa xmm3, xmm4 ; q1
+
+ psubusb xmm1, xmm2 ; q2-=q3
+ psubusb xmm2, xmm6 ; q3-=q2
+
+ psubusb xmm4, xmm6 ; q1-=q2
+ psubusb xmm6, xmm3 ; q2-=q1
+
+ por xmm4, xmm6 ; abs(q2-q1)
+ por xmm1, xmm2 ; abs(q3-q2)
+
+ movdqa xmm0, xmm5 ; q0
+ pmaxub xmm1, xmm4 ; xmm1 accumulates the running maximum of the neighbour differences
+
+ psubusb xmm5, xmm3 ; q0-=q1
+ psubusb xmm3, xmm0 ; q1-=q0
+
+ por xmm5, xmm3 ; abs(q0-q1)
+ movdqa [rsp+_t0], xmm5 ; save to t0
+
+ pmaxub xmm1, xmm5 ; fold abs(q0-q1) into the running maximum
+
+%if %1
+ movdqa xmm2, [rsi+4*rax] ; p3
+ movdqa xmm4, [rdi+4*rax] ; p2
+ movdqa xmm6, [rsi+2*rax] ; p1
+%else
+ movlps xmm2, [rsi + rax] ; p3
+ movlps xmm4, [rsi] ; p2
+ movlps xmm6, [rsi + rcx] ; p1
+
+ movhps xmm2, [rdi + rax] ; p3, second plane in the high 8 bytes
+ movhps xmm4, [rdi] ; p2
+ movhps xmm6, [rdi + rcx] ; p1
+
+ movdqa [rsp+_p2], xmm4 ; store p2
+ movdqa [rsp+_p1], xmm6 ; store p1
+%endif
+
+ movdqa xmm5, xmm4 ; p2
+ movdqa xmm3, xmm6 ; p1
+
+ psubusb xmm4, xmm2 ; p2-=p3
+ psubusb xmm2, xmm5 ; p3-=p2
+
+ psubusb xmm3, xmm5 ; p1-=p2
+ pmaxub xmm1, xmm4 ; abs(p3 - p2), first half: saturated p2-p3
+
+ psubusb xmm5, xmm6 ; p2-=p1
+ pmaxub xmm1, xmm2 ; abs(p3 - p2), second half: saturated p3-p2
+
+ pmaxub xmm1, xmm5 ; abs(p2 - p1), first half: saturated p2-p1
+ movdqa xmm2, xmm6 ; p1
+
+ pmaxub xmm1, xmm3 ; abs(p2 - p1), second half: saturated p1-p2
+%if %1
+ movdqa xmm4, [rsi+rax] ; p0
+ movdqa xmm3, [rdi] ; q1
+%else
+ movlps xmm4, [rsi + rcx*2] ; p0
+ movhps xmm4, [rdi + rcx*2] ; p0, second plane in the high 8 bytes
+ movdqa xmm3, [rsp+_q1] ; q1
+%endif
+
+ movdqa xmm5, xmm4 ; p0
+ psubusb xmm4, xmm6 ; p0-=p1
+
+ psubusb xmm6, xmm5 ; p1-=p0
+
+ por xmm6, xmm4 ; abs(p1 - p0)
+ mov rdx, arg(2) ; get blimit
+
+ movdqa [rsp+_t1], xmm6 ; save to t1
+
+ movdqa xmm4, xmm3 ; q1
+ pmaxub xmm1, xmm6 ; fold abs(p1-p0) into the running maximum
+
+ psubusb xmm3, xmm2 ; q1-=p1
+ psubusb xmm2, xmm4 ; p1-=q1
+
+ psubusb xmm1, xmm7 ; max neighbour difference - limit: nonzero where limit is exceeded
+ por xmm2, xmm3 ; abs(p1-q1)
+
+ movdqa xmm7, [rdx] ; blimit
+ mov rdx, arg(4) ; hev get thresh
+
+ movdqa xmm3, xmm0 ; q0
+ pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
+
+ movdqa xmm6, xmm5 ; p0
+ psrlw xmm2, 1 ; abs(p1-q1)/2
+
+ psubusb xmm5, xmm3 ; p0-=q0
+ psubusb xmm3, xmm6 ; q0-=p0
+ por xmm5, xmm3 ; abs(p0 - q0)
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+
+ movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
+ movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
+
+ paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm2, [rdx] ; hev
+
+ psubusb xmm5, xmm7 ; nonzero where abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ psubusb xmm4, xmm2 ; hev: nonzero where abs(q1 - q0) > thresh
+
+ psubusb xmm3, xmm2 ; hev: nonzero where abs(p1 - p0) > thresh
+ por xmm1, xmm5 ; combine the limit and blimit tests; zero means "filter this lane"
+
+ pxor xmm7, xmm7 ; xmm7 = 0
+ paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ pcmpeqb xmm4, xmm5 ; hev; NOTE(review): compares with xmm5, not zero - lanes where xmm5 != 0 are already excluded by the mask in xmm1, confirm this is intended
+ pcmpeqb xmm3, xmm3 ; hev: xmm3 = all ones
+
+ pcmpeqb xmm1, xmm7 ; mask xmm1
+ pxor xmm4, xmm3 ; hev: invert, so 0xff marks high edge variance
+%endmacro
+; B_FILTER %1: filters p1,p0,q0,q1. In: xmm1 = mask, xmm4 = hev, xmm6 = p0, xmm0 = q0 (%1=2 reloads both from the stack). %1=0/1 store the result; %1=2 leaves p1,p0,q0,q1 in xmm1,xmm6,xmm3,xmm7.
+%macro B_FILTER 1
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm6, xmm3 ; p0 offset to convert to signed values
+
+ pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
+ pxor xmm0, xmm3 ; q0 offset to convert to signed values
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; second copy of the masked filter value
+ paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
+ paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
+
+ punpckhbw xmm5, xmm2 ; axbxcxdx: value in the high byte of each word, low byte is leftover xmm5 data shifted out below
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+
+ punpcklbw xmm0, xmm1 ; exfxgxhx
+ psraw xmm5, 11 ; sign extended shift right by 3
+
+ punpckhbw xmm1, xmm1 ; axbxcxdx
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
+ psraw xmm0, 11 ; sign extended shift right by 3
+
+ psraw xmm1, 11 ; sign extended shift right by 3
+ movdqa xmm5, xmm0 ; save results (low 8 lanes as words) for the outer taps
+
+ packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
+
+ paddsb xmm6, xmm2 ; p0+= p0 add
+
+ movdqa xmm2, [GLOBAL(ones)]
+ paddsw xmm5, xmm2 ; +1 for rounding, low 8 lanes
+ paddsw xmm1, xmm2 ; +1 for rounding, high 8 lanes
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ psraw xmm1, 1 ; partial shifted one more time for 2nd tap
+ packsswb xmm5, xmm1 ; (((3* (q0 - p0) + hvm(p1 - q1) + 4) >> 3) + 1) >> 1
+ movdqa xmm2, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_p1] ; p1
+ lea rsi, [rsi + rcx*2] ; advance both pointers two rows, onto the p0 row
+ lea rdi, [rdi + rcx*2]
+%elif %1 == 1
+ movdqa xmm1, [rsi+2*rax] ; p1
+%elif %1 == 2
+ movdqa xmm1, [rsp+_p1] ; p1
+%endif
+
+ pandn xmm4, xmm5 ; outer-tap adjustment, kept only where hev is NOT set
+ pxor xmm6, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; reoffset
+ psubsb xmm3, xmm0 ; q0-= q0 add
+
+ paddsb xmm1, xmm4 ; p1+= p1 add
+ pxor xmm3, xmm2 ; unoffset
+
+ pxor xmm1, xmm2 ; unoffset
+ psubsb xmm7, xmm4 ; q1-= q1 add
+
+ pxor xmm7, xmm2 ; unoffset
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rax], xmm1 ; p1
+ movhps [rdi + rax], xmm1
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ movq [rsi + rcx*2], xmm7 ; q1
+ movhps [rdi + rcx*2], xmm7
+%elif %1 == 1
+ movdqa [rsi+rax], xmm6 ; write back p0
+ movdqa [rsi+2*rax], xmm1 ; write back p1
+ movdqa [rsi], xmm3 ; write back q0
+ movdqa [rdi], xmm7 ; write back q1
+%endif
+
+%endmacro
+
+%if ABI_IS_32BIT
+; Normal loop filter across a horizontal edge, 16 pixels wide; assembled only when ABI_IS_32BIT is nonzero. Rows are accessed with movdqa, so they must be 16-byte aligned.
+;void vp8_loop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots (_t0 .. _q3) used by the macros
+
+ mov rsi, arg(0) ;src_ptr (the q0 row)
+ movsxd rax, dword ptr arg(1) ;src_pixel_step: row pitch in bytes, sign-extended
+
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the result
+ B_FILTER 1
+
+ add rsp, lf_var_size ; release the scratch slots
+ pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro defined in the included ABI file) - confirm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+; Normal loop filter across a horizontal edge for two 8-pixel-wide planes at once: u in the low and v in the high 8 bytes of each xmm register.
+;void vp8_loop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots (_t0 .. _q3) used by the macros
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax ; rcx = +pitch
+ neg rax ; negate pitch to deal with above border
+
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx] ; rsi = u + pitch (the q1 row, as the macro expects)
+ lea rdi, [rdi + rcx] ; rdi = v + pitch
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the result
+ B_FILTER 0
+
+ add rsp, lf_var_size ; release the scratch slots
+ pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro defined in the included ABI file) - confirm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; MB_FILTER_AND_WRITEBACK %1: filters p2..q2. In: xmm1 = mask, xmm4 = hev, xmm6 = p0, xmm0 = q0 (%1=2 reloads both from the stack). %1=0/1 store all six rows; %1=2 stores p1,p0,q0,q1 to the stack and leaves p2 in xmm2, q2 in xmm5.
+%macro MB_FILTER_AND_WRITEBACK 1
+ movdqa xmm3, [GLOBAL(t80)]
+%if %1 == 0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
+%elif %1 == 1
+ movdqa xmm2, [rsi+2*rax] ; p1
+ movdqa xmm7, [rdi] ; q1
+
+ mov rcx, rax
+ neg rcx ; rcx = -rax, for addressing rows on the q side
+%elif %1 == 2
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+ pxor xmm6, xmm3 ; p0 offset to convert to signed values
+ pxor xmm0, xmm3 ; q0 offset to convert to signed values
+
+ psubsb xmm2, xmm7 ; p1 - q1
+
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 2 * (q0 - p0) + (p1 - q1)
+ paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
+ pand xmm1, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm2, xmm1 ; vp8_filter
+
+ pand xmm2, xmm4 ; Filter2 = vp8_filter & hev
+ pxor xmm0, xmm0 ; clear xmm0 so the unpack below leaves zero low bytes
+
+ pandn xmm4, xmm1 ; vp8_filter&=~hev
+ pxor xmm1, xmm1 ; clear xmm1 so the unpack below leaves zero low bytes
+
+ punpcklbw xmm0, xmm4 ; lanes 0-7 of (vp8_filter & ~hev) in the high byte of each word; later comments call this "(hi)"
+ punpckhbw xmm1, xmm4 ; lanes 8-15 of (vp8_filter & ~hev) in the high byte of each word; later comments call this "(lo)"
+
+ movdqa xmm5, xmm2 ; copy of Filter2 for the +3 side
+
+ movdqa xmm4, [GLOBAL(s9)]
+ paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
+
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
+
+ punpckhbw xmm7, xmm5 ; axbxcxdx: value in the high byte of each word, low byte is leftover xmm7 data shifted out below
+ punpcklbw xmm5, xmm5 ; exfxgxhx
+
+ psraw xmm7, 11 ; sign extended shift right by 3
+
+ psraw xmm5, 11 ; sign extended shift right by 3
+ punpckhbw xmm4, xmm2 ; axbxcxdx
+
+ punpcklbw xmm2, xmm2 ; exfxgxhx
+ psraw xmm4, 11 ; sign extended shift right by 3
+
+ packsswb xmm5, xmm7 ; Filter2 >>=3;
+ psraw xmm2, 11 ; sign extended shift right by 3
+
+ packsswb xmm2, xmm4 ; Filter1 >>=3;
+
+ paddsb xmm6, xmm5 ; ps0 =ps0 + Filter2
+
+ psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
+ movdqa xmm7, xmm1 ; Filter 2 (lo) * 9
+
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0 ; Filter 2 (hi) * 9
+ movdqa xmm2, xmm5 ; Filter 2 (hi) * 9
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7 ; Filter 2 (lo) * 9
+
+ paddw xmm5, xmm5 ; Filter 2 (hi) * 18
+
+ paddw xmm7, xmm7 ; Filter 2 (lo) * 18
+ paddw xmm5, xmm0 ; Filter 2 (hi) * 27 + 63
+
+ paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
+ paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
+ psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
+ psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
+ psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
+
+ packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
+
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
+ psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
+
+ packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
+
+%if %1 == 0
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2] ; advance both pointers two rows, onto the p0 row
+ lea rdi, [rdi+rcx*2]
+
+%elif %1 == 1
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
+%elif %1 == 2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
+%endif
+
+ pxor xmm1, xmm7 ; q1 to signed
+ pxor xmm4, xmm7 ; p1 to signed
+
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 + u3)
+ psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
+ paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 + u2)
+
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
+
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7 ; p2 to signed
+ pxor xmm5, xmm7 ; q2 to signed
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 + u)
+ psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *op0 = sp^0x80
+%if %1 == 0
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2] ; rdx = 3 * rcx, offset of the q2 row
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
+%elif %1 == 1
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ; p2
+%elif %1 == 2
+ movdqa [rsp+_p1], xmm4 ; p1
+ movdqa [rsp+_p0], xmm6 ; p0
+ movdqa [rsp+_q0], xmm3 ; q0
+ movdqa [rsp+_q1], xmm1 ; q1
+%endif
+
+%endmacro
+
+; Macroblock-edge loop filter across a horizontal edge, 16 pixels wide; rows are accessed with movdqa, so they must be 16-byte aligned.
+;void vp8_mbloop_filter_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots (_t0 .. _q3) used by the macros
+
+ mov rsi, arg(0) ;src_ptr (the q0 row)
+ movsxd rax, dword ptr arg(1) ;src_pixel_step: row pitch in bytes, sign-extended
+ mov rdx, arg(3) ;limit
+
+ lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 1
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 1
+
+ add rsp, lf_var_size ; release the scratch slots
+ pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro defined in the included ABI file) - confirm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Macroblock-edge loop filter across a horizontal edge for two 8-pixel-wide planes at once: u in the low and v in the high 8 bytes of each xmm register.
+;void vp8_mbloop_filter_horizontal_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch slots (_t0 .. _q3) used by the macros
+
+ mov rsi, arg(0) ; u
+ mov rdi, arg(5) ; v
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+ mov rcx, rax ; rcx = +pitch
+ neg rax ; negate pitch to deal with above border
+ mov rdx, arg(3) ;limit
+
+ lea rsi, [rsi + rcx] ; rsi = u + pitch (the q1 row, as the macro expects)
+ lea rdi, [rdi + rcx] ; rdi = v + pitch
+
+ ; calculate breakout conditions and high edge variance
+ LFH_FILTER_AND_HEV_MASK 0
+ ; filter and write back the results
+ MB_FILTER_AND_WRITEBACK 0
+
+ add rsp, lf_var_size ; release the scratch slots
+ pop rsp ; presumably restores the rsp saved by ALIGN_STACK (macro defined in the included ABI file) - confirm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; TRANSPOSE_16X8 %1 %2: transposes 16 rows x 8 bytes into 8 columns x 16. %1=1: rows 8-15 follow rows 0-7; %1=0: rows 8-15 come from arg(5) - 4. %2=0 also stores p3,p2,q2,q3. Assumes rdi = rsi + rax and rcx = 3*rax (set by caller) - confirm.
+%macro TRANSPOSE_16X8 2
+ movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+
+ punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+
+ movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+
+ movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+ punpcklbw xmm0, xmm7 ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
+
+ movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+
+ punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+%if %1
+ lea rsi, [rsi+rax*8] ; advance to rows 8-15
+ lea rdi, [rdi+rax*8]
+%else
+ mov rsi, arg(5) ; v_ptr
+%endif
+
+ movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+ punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+ punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
+ punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+
+%if %1 == 0
+ lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+ lea rsi, [rsi - 4] ; back up 4 bytes so the load spans 4 pixels each side of the edge
+%endif
+
+ movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+
+ movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
+ punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+
+ punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa [rsp+_t0], xmm2 ; save to free XMM2
+
+ movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+
+ punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+ punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+
+ movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+
+ punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
+
+ movdqa xmm6, xmm1 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
+ punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
+
+ punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+ movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+
+ punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+
+ punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ movdqa xmm0, xmm5 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+
+ punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+ movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
+
+ punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
+
+ punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
+ movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
+
+ punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+
+ punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
+
+%if %2 == 0
+ movdqa [rsp+_q3], xmm7 ; save 7
+ movdqa [rsp+_q2], xmm6 ; save 6
+%endif
+ movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa [rsp+_p1], xmm2 ; save 2
+
+ movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
+ punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ movdqa [rsp+_p0], xmm3 ; save 3
+
+ punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+
+ movdqa [rsp+_q0], xmm4 ; save 4
+ movdqa [rsp+_q1], xmm5 ; save 5
+ movdqa xmm1, [rsp+_t0] ; reload 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+
+ movdqa xmm2, xmm1 ; copy for the column-0 half
+ punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+%if %2 == 0
+ movdqa [rsp+_p2], xmm1 ; save 1
+ movdqa [rsp+_p3], xmm2 ; save 0
+%endif
+
+%endmacro
+; LFV_FILTER_MASK_HEV_MASK: In: xmm2 = p3, xmm1 = p2, xmm3 = p0, xmm5 = q1, xmm6 = q2, xmm7 = q3, plus stack slots _p1, _q0, _q1. Out: xmm1 = filter mask, xmm4 = hev mask.
+%macro LFV_FILTER_MASK_HEV_MASK 0
+ movdqa xmm0, xmm6 ; q2
+ psubusb xmm0, xmm7 ; q2-q3
+
+ psubusb xmm7, xmm6 ; q3-q2
+ movdqa xmm4, xmm5 ; q1
+
+ por xmm7, xmm0 ; abs (q3-q2)
+ psubusb xmm4, xmm6 ; q1-q2
+
+ movdqa xmm0, xmm1 ; p2
+ psubusb xmm6, xmm5 ; q2-q1
+
+ por xmm6, xmm4 ; abs (q2-q1)
+ psubusb xmm0, xmm2 ; p2 - p3;
+
+ psubusb xmm2, xmm1 ; p3 - p2;
+ por xmm0, xmm2 ; abs(p2-p3)
+
+ movdqa xmm5, [rsp+_p1] ; p1
+ pmaxub xmm0, xmm7 ; xmm0 accumulates the running maximum of the neighbour differences
+
+ movdqa xmm2, xmm5 ; p1
+ psubusb xmm5, xmm1 ; p1-p2
+ psubusb xmm1, xmm2 ; p2-p1
+
+ movdqa xmm7, xmm3 ; p0
+ psubusb xmm7, xmm2 ; p0-p1
+
+ por xmm1, xmm5 ; abs(p2-p1)
+ pmaxub xmm0, xmm6 ; fold abs(q2-q1) into the running maximum
+
+ pmaxub xmm0, xmm1 ; fold abs(p2-p1) into the running maximum
+ movdqa xmm1, xmm2 ; p1
+
+ psubusb xmm2, xmm3 ; p1-p0
+
+ por xmm2, xmm7 ; abs(p1-p0)
+
+ pmaxub xmm0, xmm2 ; fold abs(p1-p0) into the running maximum
+
+ movdqa xmm5, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+
+ mov rdx, arg(3) ; limit
+
+ movdqa xmm6, xmm5 ; q0
+ movdqa xmm4, xmm7 ; q1
+
+ psubusb xmm5, xmm7 ; q0-q1
+ psubusb xmm7, xmm6 ; q1-q0
+
+ por xmm7, xmm5 ; abs(q1-q0)
+
+ pmaxub xmm0, xmm7 ; fold abs(q1-q0) into the running maximum
+
+ psubusb xmm0, [rdx] ; limit: nonzero where any neighbour difference exceeds it
+
+ mov rdx, arg(2) ; blimit
+ movdqa xmm5, xmm4 ; q1
+
+ psubusb xmm5, xmm1 ; q1-=p1
+ psubusb xmm1, xmm4 ; p1-=q1
+
+ por xmm5, xmm1 ; abs(p1-q1)
+ movdqa xmm1, xmm3 ; p0
+
+ pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psubusb xmm1, xmm6 ; p0-q0
+
+ movdqa xmm4, [rdx] ; blimit
+ mov rdx, arg(4) ; get thresh
+
+ psrlw xmm5, 1 ; abs(p1-q1)/2
+ psubusb xmm6, xmm3 ; q0-p0
+
+ por xmm1, xmm6 ; abs(q0-p0)
+ paddusb xmm1, xmm1 ; abs(q0-p0)*2
+ movdqa xmm3, [rdx] ; thresh
+
+ paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm2, xmm3 ; abs(p1 - p0) > thresh
+
+ psubusb xmm7, xmm3 ; abs(q1 - q0) > thresh
+
+ psubusb xmm1, xmm4 ; nonzero where abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+
+ por xmm1, xmm0 ; mask: zero means "filter this lane"
+ pcmpeqb xmm2, xmm0 ; NOTE(review): xmm0 is the limit residual, not yet zeroed - lanes where it is nonzero are already excluded by the mask in xmm1, confirm this is intended
+
+ pxor xmm0, xmm0 ; xmm0 = 0
+ pcmpeqb xmm4, xmm4 ; xmm4 = all ones
+
+ pcmpeqb xmm1, xmm0 ; xmm1 = filter mask (0xff where the lane is filtered)
+ pxor xmm4, xmm2 ; xmm4 = hev mask (inverted compare result)
+%endmacro
+
+; BV_TRANSPOSE
+; Interleaves four 16-byte column vectors (columns 2..5 of a 16-row strip)
+; so that every output dword holds the four bytes of one row.
+; In: xmm1, xmm6, xmm3, xmm7 = columns 2, 3, 4, 5 (layouts listed below).
+; Out: xmm2 = rows 0-3, xmm6 = rows 4-7, xmm1 = rows 8-b, xmm5 = rows c-f.
+; xmm3 and xmm4 are overwritten with intermediates; xmm7 is only read.
+%macro BV_TRANSPOSE 0
+ ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
+ movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+
+ movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+
+ movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+
+ punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+
+ punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+ ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
+ ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
+ ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
+ ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
+%endmacro
+
+; BV_WRITEBACK %1, %2
+; Stores the four dwords of each of two xmm registers to eight rows, at byte
+; offset +2 from the row base.
+; %1 goes to [rsi], [rdi], [rsi+2*rax], [rdi+2*rax];
+; %2 goes to [rsi+4*rax], [rdi+4*rax], [rsi+2*rcx], [rdi+2*rcx].
+; When rdi = rsi + rax and rcx = 3*rax (how the functions in this file set
+; them up) that is rows 0-3 from %1 and rows 4-7 from %2, relative to rsi.
+; Both register arguments are shifted in place and are not preserved.
+%macro BV_WRITEBACK 2
+ movd [rsi+2], %1 ; row 0
+ movd [rsi+4*rax+2], %2 ; row 4
+ psrldq %1, 4 ; bring the next dword down
+ psrldq %2, 4
+ movd [rdi+2], %1 ; row 1
+ movd [rdi+4*rax+2], %2 ; row 5
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rsi+2*rax+2], %1 ; row 2
+ movd [rsi+2*rcx+2], %2 ; row 6
+ psrldq %1, 4
+ psrldq %2, 4
+ movd [rdi+2*rax+2], %1 ; row 3
+ movd [rdi+2*rcx+2], %2 ; row 7
+%endmacro
+
+%if ABI_IS_32BIT
+
+;void vp8_loop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Filters one vertical edge over 16 rows: transposes the 8 bytes around the
+; edge (src_ptr - 4 .. src_ptr + 3) of each row, builds the filter and hev
+; masks, runs B_FILTER, then transposes columns 2..5 back and stores them.
+; Only assembled when ABI_IS_32BIT is set.
+global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch area addressed as [rsp+_p3] ... [rsp+_q3] by the macros
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4] ; start 4 bytes left of the edge
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax] ; rcx = 3 * src_pixel_step
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 1, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only work on q1, q0, p0, p1
+ BV_TRANSPOSE
+ ; store 16-line result
+
+ lea rdx, [rax] ; rdx = src_pixel_step
+ neg rdx ; rdx = -src_pixel_step
+
+ ; xmm1/xmm5 hold rows 8-f after BV_TRANSPOSE; presumably TRANSPOSE_16X8
+ ; left rsi/rdi on row 8 - TODO confirm against the macro.
+ BV_WRITEBACK xmm1, xmm5
+
+ lea rsi, [rsi+rdx*8] ; step back 8 rows
+ lea rdi, [rdi+rdx*8]
+ BV_WRITEBACK xmm2, xmm6 ; rows 0-7
+
+ add rsp, lf_var_size
+ pop rsp ; presumably undoes ALIGN_STACK (macro defined elsewhere)
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%endif
+
+;void vp8_loop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+; Chroma variant of the vertical-edge filter: the 16 rows handed to the
+; filter come from two planes. Columns 2..5 are written back in two groups
+; of 8 rows; the second group goes to u - 4.
+; NOTE(review): v (arg(5)) is not read in this function body; presumably
+; TRANSPOSE_16X8 0, ... loads it and leaves rsi on the v plane, which is
+; where the first write-back lands - confirm against the macro.
+global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_loop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch area addressed as [rsp+_p3] ... [rsp+_q3] by the macros
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4] ; start 4 bytes left of the edge
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax] ; rcx = 3 * src_pixel_step
+
+ ;transpose 16x8 to 8x16, and store the 8-line result on stack.
+ TRANSPOSE_16X8 0, 1
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ B_FILTER 2
+
+ ; transpose and write back - only work on q1, q0, p0, p1
+ BV_TRANSPOSE
+
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+ ; store 16-line result
+ BV_WRITEBACK xmm1, xmm5 ; rows 8-f of the transposed strip, at the current rsi
+
+ mov rsi, arg(0) ; u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ BV_WRITEBACK xmm2, xmm6 ; rows 0-7 of the transposed strip, to the u plane
+
+ add rsp, lf_var_size
+ pop rsp ; presumably undoes ALIGN_STACK (macro defined elsewhere)
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; MBV_TRANSPOSE
+; First half of the 8-column -> 16-row transpose used by the macroblock
+; vertical-edge filters.
+; In: column 0 = [rsp+_p3], column 1 = xmm2, column 2 = [rsp+_p1],
+; column 3 = [rsp+_p0], column 4 = [rsp+_q0], column 5 = [rsp+_q1],
+; column 6 = xmm5, column 7 = [rsp+_q3].
+; Out: xmm0 = rows 0-1 and xmm6 = rows 2-3 (8 bytes per row, complete);
+; xmm3 / xmm2 = columns 0-3 / 4-7 of rows 4-7;
+; xmm1 / xmm4 = columns 0-3 of rows 8-b / c-f.
+; xmm5 is only read here; MBV_WRITEBACK_1 consumes it again.
+%macro MBV_TRANSPOSE 0
+ movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+
+ punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+
+ punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+
+ movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
+ punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+
+ movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+
+ punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+
+ punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+%endmacro
+
+; MBV_WRITEBACK_1
+; Stores rows 0-7 (8 bytes each) of the transposed result and prepares the
+; registers MBV_WRITEBACK_2 needs for rows 8-f.
+; In: the register state left by MBV_TRANSPOSE, plus xmm5 = column 6 and the
+; [rsp+_q0], [rsp+_q1], [rsp+_q3] stack slots.
+; Rows are addressed as rsi, rdi, rsi+2*rax, rdi+2*rax, rsi+4*rax, rdi+4*rax,
+; rsi+2*rcx, rdi+2*rcx, i.e. rows 0..7 when rdi = rsi + rax and rcx = 3*rax.
+; Out: xmm1 = rows 8-9, xmm5 = rows a-b, xmm7 = columns 4-7 of rows c-f;
+; xmm4 is left untouched for MBV_WRITEBACK_2.
+%macro MBV_WRITEBACK_1 0
+ movq [rsi], xmm0 ; row 0
+ movhps [rdi], xmm0 ; row 1
+
+ movq [rsi+2*rax], xmm6 ; row 2
+ movhps [rdi+2*rax], xmm6 ; row 3
+
+ movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+
+ movq [rsi+4*rax], xmm0 ; row 4
+ movhps [rdi+4*rax], xmm0 ; row 5
+
+ movq [rsi+2*rcx], xmm3 ; row 6
+ movhps [rdi+2*rcx], xmm3 ; row 7
+
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+ punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+
+ movdqa xmm0, xmm7
+ punpcklwd xmm0, xmm5 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
+ punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+
+ movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
+ punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
+%endmacro
+
+; MBV_WRITEBACK_2
+; Stores rows 8-f (8 bytes each) of the transposed result.
+; In: xmm1 = rows 8-9, xmm5 = rows a-b, xmm4 = columns 0-3 of rows c-f,
+; xmm7 = columns 4-7 of rows c-f (as left by MBV_WRITEBACK_1).
+; Uses the same rsi/rdi/rax/rcx row addressing as MBV_WRITEBACK_1, so the
+; caller must point rsi/rdi at the first of these eight rows beforehand.
+%macro MBV_WRITEBACK_2 0
+ movq [rsi], xmm1 ; row 8
+ movhps [rdi], xmm1 ; row 9
+
+ movq [rsi+2*rax], xmm5 ; row a
+ movhps [rdi+2*rax], xmm5 ; row b
+
+ movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+ punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+ punpckhdq xmm4, xmm7 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+
+ movq [rsi+4*rax], xmm1 ; row c
+ movhps [rdi+4*rax], xmm1 ; row d
+
+ movq [rsi+2*rcx], xmm4 ; row e
+ movhps [rdi+2*rcx], xmm4 ; row f
+%endmacro
+
+
+;void vp8_mbloop_filter_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh
+;)
+; Macroblock-edge filter for one vertical edge over 16 rows: transposes the
+; 8 bytes around the edge (src_ptr - 4 .. src_ptr + 3) of each row, builds
+; the masks, runs MB_FILTER_AND_WRITEBACK, then transposes all 8 columns
+; back and stores 8 bytes per row.
+global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch area addressed as [rsp+_p3] ... [rsp+_q3] by the macros
+
+ mov rsi, arg(0) ; src_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4] ; start 4 bytes left of the edge
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax*2+rax] ; rcx = 3 * src_pixel_step
+
+ ; Transpose
+ TRANSPOSE_16X8 1, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ neg rax ; rax = -src_pixel_step while the filter macro runs
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; rax is negative here, so this moves rsi/rdi back 8 rows; presumably
+ ; TRANSPOSE_16X8 left them on row 8 - TODO confirm against the macro.
+ lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ neg rax ; back to +src_pixel_step for the row addressing below
+
+ MBV_WRITEBACK_1
+
+
+ lea rsi, [rsi+rax*8] ; advance 8 rows for the second half
+ lea rdi, [rdi+rax*8]
+ MBV_WRITEBACK_2
+
+ add rsp, lf_var_size
+ pop rsp ; presumably undoes ALIGN_STACK (macro defined elsewhere)
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_mbloop_filter_vertical_edge_uv_sse2
+;(
+; unsigned char *u,
+; int src_pixel_step,
+; const char *blimit,
+; const char *limit,
+; const char *thresh,
+; unsigned char *v
+;)
+; Chroma variant of the macroblock vertical-edge filter. After filtering,
+; rows 0-7 of the transposed result are written to u - 4 and rows 8-f to
+; v - 4, 8 bytes per row.
+; NOTE(review): the loads from the v plane are not visible in this body;
+; presumably TRANSPOSE_16X8 0, ... reads arg(5) - confirm against the macro.
+global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
+sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, lf_var_size ; scratch area addressed as [rsp+_p3] ... [rsp+_q3] by the macros
+
+ mov rsi, arg(0) ; u_ptr
+ movsxd rax, dword ptr arg(1) ; src_pixel_step
+
+ lea rsi, [rsi - 4] ; start 4 bytes left of the edge
+ lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+ lea rcx, [rax+2*rax] ; rcx = 3 * src_pixel_step
+
+ ; Transpose
+ TRANSPOSE_16X8 0, 0
+
+ ; calculate filter mask and high edge variance
+ LFV_FILTER_MASK_HEV_MASK
+
+ ; start work on filters
+ MB_FILTER_AND_WRITEBACK 2
+
+ ; transpose and write back
+ MBV_TRANSPOSE
+
+ mov rsi, arg(0) ;u_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; row +1
+ MBV_WRITEBACK_1 ; rows 0-7 -> u plane
+ mov rsi, arg(5) ;v_ptr
+ lea rsi, [rsi - 4]
+ lea rdi, [rsi + rax] ; row +1
+ MBV_WRITEBACK_2 ; rows 8-f -> v plane
+
+ add rsp, lf_var_size
+ pop rsp ; presumably undoes ALIGN_STACK (macro defined elsewhere)
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_horizontal_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+; Simple loop filter across a horizontal edge, 16 pixels wide.
+; Reads p1, p0 (the two rows above src_ptr) and q0, q1 (src_ptr and the row
+; below), and rewrites only p0 and q0. All row accesses and the blimit load
+; use movdqa, so those addresses must be 16-byte aligned.
+global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_horizontal_edge_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx
+ ; end prolog
+
+ mov rcx, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+ movdqa xmm6, [GLOBAL(tfe)]
+ lea rdx, [rcx + rax] ; rdx -> q1 row
+ neg rax ; rax = -src_pixel_step, used to reach the p rows
+
+ ; calculate mask
+ movdqa xmm0, [rdx] ; q1
+ mov rdx, arg(2) ;blimit
+ movdqa xmm1, [rcx+2*rax] ; p1
+
+ movdqa xmm2, xmm1 ; keep p1 for the filter
+ movdqa xmm3, xmm0 ; keep q1 for the filter
+
+ psubusb xmm0, xmm1 ; q1-=p1
+ psubusb xmm1, xmm3 ; p1-=q1
+ por xmm1, xmm0 ; abs(p1-q1)
+ pand xmm1, xmm6 ; set lsb of each byte to zero
+ psrlw xmm1, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, XMMWORD PTR [rdx] ; blimit
+
+ movdqa xmm5, [rcx+rax] ; p0
+ movdqa xmm4, [rcx] ; q0
+ movdqa xmm0, xmm4 ; q0
+ movdqa xmm6, xmm5 ; p0
+ psubusb xmm5, xmm4 ; p0-=q0
+ psubusb xmm4, xmm6 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; xmm5 = mask: 0xff where the sum is <= blimit
+
+
+ ; start work on filters
+ pxor xmm2, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
+
+ pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm0, xmm4 ; offset to convert to signed values
+ movdqa xmm3, xmm0 ; q0
+ psubsb xmm0, xmm6 ; q0 - p0
+ paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm2 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3 (applied to p0 below)
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 (applied to q0 below)
+
+ movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7 ; not needed: xmm7 is still zero from the mask compare above
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm0, 3 ; word-wide logical shift; the pand/por pair makes it a per-byte signed >> 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0 -= (filter + 4) >> 3
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm1 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm6, xmm5 ; p0 += (filter + 3) >> 3
+
+ pxor xmm3, xmm4 ; unoffset
+ movdqa [rcx], xmm3 ; write back
+
+ pxor xmm6, xmm4 ; unoffset
+ movdqa [rcx+rax], xmm6 ; write back
+
+ ; begin epilog
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_loop_filter_simple_vertical_edge_sse2
+;(
+; unsigned char *src_ptr,
+; int src_pixel_step,
+; const char *blimit
+;)
+; Simple loop filter across a vertical edge, 16 rows tall.
+; Loads 4 bytes (src_ptr - 2 .. src_ptr + 1) from each of 16 rows, transposes
+; them into four 16-byte column vectors p1, p0, q0, q1, filters p0/q0 with the
+; same arithmetic as the simple horizontal filter, then transposes back and
+; stores 4 bytes per row (the p1 and q1 bytes are rewritten with their
+; original values, saved in t0/t1).
+global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
+sym(vp8_loop_filter_simple_vertical_edge_sse2):
+ push rbp ; save old base pointer value.
+ mov rbp, rsp ; set new base pointer value.
+ SHADOW_ARGS_TO_STACK 3
+ SAVE_XMM 7
+ GET_GOT rbx ; save callee-saved reg
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 32 ; reserve 32 bytes
+ %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
+ %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
+
+ lea rsi, [rsi - 2 ] ; row 0, two bytes left of the edge
+ lea rdi, [rsi + rax] ; row 1
+ lea rdx, [rsi + rax*4] ; row 4
+ lea rcx, [rdx + rax] ; row 5
+
+ movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
+ movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
+ movd xmm2, [rdi] ; 13 12 11 10
+ movd xmm3, [rcx] ; 53 52 51 50
+ punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
+ punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
+
+ movd xmm4, [rsi + rax*2] ; 23 22 21 20
+ movd xmm5, [rdx + rax*2] ; 63 62 61 60
+ movd xmm6, [rdi + rax*2] ; 33 32 31 30
+ movd xmm7, [rcx + rax*2] ; 73 72 71 70
+ punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
+ punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
+
+ punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+ movdqa xmm2, xmm0
+ punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+ punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+
+ lea rsi, [rsi + rax*8] ; row 8
+ lea rdi, [rsi + rax] ; row 9
+ lea rdx, [rsi + rax*4] ; row c
+ lea rcx, [rdx + rax] ; row d
+
+ movd xmm4, [rsi] ; 83 82 81 80
+ movd xmm1, [rdx] ; c3 c2 c1 c0
+ movd xmm6, [rdi] ; 93 92 91 90
+ movd xmm3, [rcx] ; d3 d2 d1 d0
+ punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
+ punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
+
+ movd xmm1, [rsi + rax*2] ; a3 a2 a1 a0
+ movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
+ movd xmm3, [rdi + rax*2] ; b3 b2 b1 b0
+ movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
+ punpckldq xmm1, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
+ punpckldq xmm3, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
+
+ punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
+ punpcklbw xmm1, xmm3 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
+
+ movdqa xmm7, xmm4
+ punpcklwd xmm4, xmm1 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
+ punpckhwd xmm7, xmm1 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
+
+ movdqa xmm6, xmm4
+ punpckldq xmm4, xmm7 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+ punpckhdq xmm6, xmm7 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+
+ punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+
+ mov rdx, arg(2) ;blimit
+
+ ; calculate mask
+ movdqa xmm6, xmm0 ; p1
+ movdqa xmm7, xmm3 ; q1
+ psubusb xmm7, xmm0 ; q1-=p1
+ psubusb xmm6, xmm3 ; p1-=q1
+ por xmm6, xmm7 ; abs(p1-q1)
+ pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ psrlw xmm6, 1 ; abs(p1-q1)/2
+
+ movdqa xmm7, [rdx] ; blimit
+
+ movdqa xmm5, xmm1 ; p0
+ movdqa xmm4, xmm2 ; q0
+ psubusb xmm5, xmm2 ; p0-=q0
+ psubusb xmm4, xmm1 ; q0-=p0
+ por xmm5, xmm4 ; abs(p0 - q0)
+ paddusb xmm5, xmm5 ; abs(p0-q0)*2
+ paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
+
+ movdqa xmm4, [GLOBAL(t80)]
+
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7 ; xmm5 = mask
+
+ ; start work on filters
+ movdqa t0, xmm0 ; save unmodified p1 for the write-back
+ movdqa t1, xmm3 ; save unmodified q1 for the write-back
+
+ pxor xmm0, xmm4 ; p1 offset to convert to signed values
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm0, xmm3 ; p1 - q1
+
+ pxor xmm1, xmm4 ; offset to convert to signed values
+ pxor xmm2, xmm4 ; offset to convert to signed values
+
+ movdqa xmm3, xmm2 ; offseted ; q0
+ psubsb xmm2, xmm1 ; q0 - p0
+ paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
+ paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
+ pand xmm5, xmm0 ; mask filter values we don't care about
+
+ movdqa xmm0, xmm5
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3 (applied to p0 below)
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 (applied to q0 below)
+
+ movdqa xmm6, [GLOBAL(te0)]
+ movdqa xmm2, [GLOBAL(t1f)]
+
+; pxor xmm7, xmm7 ; not needed: xmm7 is still zero from the mask compare above
+ pcmpgtb xmm7, xmm0 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm0, 3 ; word-wide logical shift; the pand/por pair makes it a per-byte signed >> 3
+ pand xmm0, xmm2 ;clear out upper 3 bits
+ por xmm0, xmm7 ;add sign
+ psubsb xmm3, xmm0 ; q0 -= (filter + 4) >> 3
+
+ pxor xmm7, xmm7
+ pcmpgtb xmm7, xmm5 ;save sign
+ pand xmm7, xmm6 ;preserve the upper 3 bits
+ psrlw xmm5, 3
+ pand xmm5, xmm2 ;clear out upper 3 bits
+ por xmm5, xmm7 ;add sign
+ paddsb xmm1, xmm5 ; p0 += (filter + 3) >> 3
+
+ pxor xmm3, xmm4 ; unoffset q0
+ pxor xmm1, xmm4 ; unoffset p0
+
+ movdqa xmm0, t0 ; p1
+ movdqa xmm4, t1 ; q1
+
+ ; write out order: xmm0 xmm2 xmm1 xmm3
+ lea rdx, [rsi + rax*4] ; rdx was reused for blimit; point it at row c again (rsi is still row 8)
+
+ ; transpose back to write out
+ ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+
+ movdqa xmm5, xmm3
+ punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+
+ movd [rsi], xmm6 ; write the second 8-line result
+ movd [rdx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi], xmm6
+ movd [rcx], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
+ movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
+ psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
+ movd [rcx + rax*2], xmm3
+
+ neg rax
+ lea rsi, [rsi + rax*8] ; back to row 0 (rax is negated around this lea)
+ neg rax
+ lea rdi, [rsi + rax] ; row 1
+ lea rdx, [rsi + rax*4] ; row 4
+ lea rcx, [rdx + rax] ; row 5
+
+ movd [rsi], xmm0 ; write the first 8-line result
+ movd [rdx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi], xmm0
+ movd [rcx], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
+ movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
+ psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
+ movd [rcx + rax*2], xmm2
+
+ add rsp, 32 ; release t0/t1
+ pop rsp ; presumably undoes ALIGN_STACK (macro defined elsewhere)
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 16-byte aligned constant vectors for the loop filters.
+SECTION_RODATA
+align 16
+tfe: ; clears bit 0 of every byte so a word-wide psrlw 1 acts as a per-byte /2
+ times 16 db 0xfe
+align 16
+t80: ; xor with 0x80 converts bytes between unsigned and signed-offset form
+ times 16 db 0x80
+align 16
+t1s: ; byte 1s (not referenced in the code visible in this part of the file)
+ times 16 db 0x01
+align 16
+t3: ; +3 rounding term added to the filter value
+ times 16 db 0x03
+align 16
+t4: ; +4 rounding term added to the filter value
+ times 16 db 0x04
+align 16
+ones: ; word 1s (not referenced in the code visible in this part of the file)
+ times 8 dw 0x0001
+align 16
+s9: ; words of 0x0900 (not referenced in the code visible in this part of the file)
+ times 8 dw 0x0900
+align 16
+s63: ; words of 0x003f (not referenced in the code visible in this part of the file)
+ times 8 dw 0x003f
+align 16
+te0: ; top 3 bits of each byte: restores the sign bits after psrlw 3
+ times 16 db 0xe0
+align 16
+t1f: ; low 5 bits of each byte: drops bits shifted in from the neighbouring byte by psrlw 3
+ times 16 db 0x1f
diff --git a/libvpx/vp8/common/x86/loopfilter_x86.c b/libvpx/vp8/common/x86/loopfilter_x86.c
new file mode 100644
index 0000000..6586004
--- /dev/null
+++ b/libvpx/vp8/common/x86/loopfilter_x86.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8/common/loopfilter.h"
+
+/* Declares an edge filter that takes a trailing `count` argument
+ * (used below for the MMX filters and the x86-64 SSE2 Y-plane filters). */
+#define prototype_loopfilter(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh, int count)
+
+/* Same as prototype_loopfilter but without the `count` argument ("nc"). */
+#define prototype_loopfilter_nc(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh)
+
+/* Declares a simple-filter edge function: plane pointer, stride and blimit only. */
+#define prototype_simple_loopfilter(sym) \
+ void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+/* MMX edge filters. */
+prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
+prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
+
+/* SSE2 inner-edge Y filters: x86-64 builds use the whole-block bv_y/bh_y
+ * entry points, other builds use the per-edge functions. */
+#if HAVE_SSE2 && ARCH_X86_64
+prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
+prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
+#else
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
+#endif
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
+
+/* SSE2 chroma filters; they take the U and V plane pointers in one call. */
+extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
+/* Horizontal MB filtering (MMX): filters the Y plane with count 2, then each
+ * chroma plane whose pointer is non-NULL with count 1. */
+void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const edge_limit = lfi->mblim;
+    const unsigned char *const inner_limit = lfi->lim;
+    const unsigned char *const hev_threshold = lfi->hev_thr;
+
+    vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, edge_limit, inner_limit, hev_threshold, 2);
+
+    if (u_ptr)
+    {
+        vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, edge_limit, inner_limit, hev_threshold, 1);
+    }
+
+    if (v_ptr)
+    {
+        vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, edge_limit, inner_limit, hev_threshold, 1);
+    }
+}
+
+
+/* Vertical MB filtering (MMX): filters the Y plane with count 2, then each
+ * chroma plane whose pointer is non-NULL with count 1. */
+void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const edge_limit = lfi->mblim;
+    const unsigned char *const inner_limit = lfi->lim;
+    const unsigned char *const hev_threshold = lfi->hev_thr;
+
+    vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, edge_limit, inner_limit, hev_threshold, 2);
+
+    if (u_ptr)
+    {
+        vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, edge_limit, inner_limit, hev_threshold, 1);
+    }
+
+    if (v_ptr)
+    {
+        vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, edge_limit, inner_limit, hev_threshold, 1);
+    }
+}
+
+
+/* Horizontal B filtering (MMX): filters the Y plane at rows 4, 8 and 12
+ * (count 2), then row 4 of each chroma plane whose pointer is non-NULL
+ * (count 1). */
+void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const edge_limit = lfi->blim;
+    const unsigned char *const inner_limit = lfi->lim;
+    const unsigned char *const hev_threshold = lfi->hev_thr;
+    int row;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride,
+                                            edge_limit, inner_limit, hev_threshold, 2);
+    }
+
+    if (u_ptr)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride,
+                                            edge_limit, inner_limit, hev_threshold, 1);
+    }
+
+    if (v_ptr)
+    {
+        vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride,
+                                            edge_limit, inner_limit, hev_threshold, 1);
+    }
+}
+
+
+/* Simple horizontal B filtering (MMX): Y plane only, at rows 4, 8 and 12. */
+void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int row;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + row * y_stride, y_stride, blimit);
+    }
+}
+
+
+/* Vertical B filtering (MMX): filters the Y plane at columns 4, 8 and 12
+ * (count 2), then column 4 of each chroma plane whose pointer is non-NULL
+ * (count 1). */
+void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    const unsigned char *const edge_limit = lfi->blim;
+    const unsigned char *const inner_limit = lfi->lim;
+    const unsigned char *const hev_threshold = lfi->hev_thr;
+    int col;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_vertical_edge_mmx(y_ptr + col, y_stride,
+                                          edge_limit, inner_limit, hev_threshold, 2);
+    }
+
+    if (u_ptr)
+    {
+        vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
+                                          edge_limit, inner_limit, hev_threshold, 1);
+    }
+
+    if (v_ptr)
+    {
+        vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride,
+                                          edge_limit, inner_limit, hev_threshold, 1);
+    }
+}
+
+
+/* Simple vertical B filtering (MMX): Y plane only, at columns 4, 8 and 12. */
+void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int col;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + col, y_stride, blimit);
+    }
+}
+#endif
+
+
+/* Horizontal MB filtering */
+#if HAVE_SSE2
+/* Horizontal MB filtering (SSE2): filters the Y plane, then both chroma
+ * planes in a single call. Only u_ptr gates the chroma call; v_ptr is passed
+ * through unchecked. */
+void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride,
+                                           lfi->mblim, lfi->lim, lfi->hev_thr);
+
+    if (!u_ptr)
+        return;
+
+    vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride,
+                                              lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Vertical MB filtering (SSE2): filters the Y plane, then both chroma planes
+ * in a single call. Only u_ptr gates the chroma call; v_ptr is passed
+ * through unchecked. */
+void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride,
+                                         lfi->mblim, lfi->lim, lfi->hev_thr);
+
+    if (!u_ptr)
+        return;
+
+    vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride,
+                                            lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
+/* Horizontal B filtering (SSE2). The Y plane is handled by one bh_y call on
+ * x86-64, or by three per-edge calls at rows 4, 8 and 12 otherwise. Both
+ * chroma planes are then filtered at row 4 in a single call, gated on u_ptr
+ * only. */
+void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+    vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+    {
+        int row;
+
+        for (row = 4; row <= 12; row += 4)
+        {
+            vp8_loop_filter_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride,
+                                                 lfi->blim, lfi->lim, lfi->hev_thr);
+        }
+    }
+#endif
+
+    if (!u_ptr)
+        return;
+
+    vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
+                                            lfi->blim, lfi->lim, lfi->hev_thr,
+                                            v_ptr + 4 * uv_stride);
+}
+
+
+/* Simple horizontal B filtering (SSE2): Y plane only, at rows 4, 8 and 12. */
+void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int row;
+
+    for (row = 4; row <= 12; row += 4)
+    {
+        vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + row * y_stride, y_stride, blimit);
+    }
+}
+
+
+/* Vertical B filtering (SSE2). The Y plane is handled by one bv_y call on
+ * x86-64, or by three per-edge calls at columns 4, 8 and 12 otherwise. Both
+ * chroma planes are then filtered at column 4 in a single call, gated on
+ * u_ptr only. */
+void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+#if ARCH_X86_64
+    vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+#else
+    {
+        int col;
+
+        for (col = 4; col <= 12; col += 4)
+        {
+            vp8_loop_filter_vertical_edge_sse2(y_ptr + col, y_stride,
+                                               lfi->blim, lfi->lim, lfi->hev_thr);
+        }
+    }
+#endif
+
+    if (!u_ptr)
+        return;
+
+    vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
+                                          lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
+}
+
+
+/* Simple vertical B filtering (SSE2): Y plane only, at columns 4, 8 and 12. */
+void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
+{
+    int col;
+
+    for (col = 4; col <= 12; col += 4)
+    {
+        vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + col, y_stride, blimit);
+    }
+}
+
+#endif
diff --git a/libvpx/vp8/common/x86/mfqe_sse2.asm b/libvpx/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000..c1d2174
--- /dev/null
+++ b/libvpx/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,281 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; Blends a 16x16 block in place: dst = (src*w + dst*(16 - w) + 8) >> 4 per byte, with w = src_weight.
+;void vp8_filter_by_weight16x16_sse2
+;(
+; unsigned char *src, ; every row is loaded with movdqa
+; int src_stride,
+; unsigned char *dst, ; every row is loaded and stored with movdqa
+; int dst_stride,
+; int src_weight ; the dst weight is tMFQE (16) minus this value
+;)
+global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
+sym(vp8_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)] ; 16 in every word
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6 ; zero, used to widen bytes to words
+
+.combine: ; one 16-pixel row per iteration
+ movdqa xmm2, [rax] ; src row
+ movdqa xmm4, [rdx] ; dst row
+ add rax, rsi ; advance src to the next row
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6 ; low 8 pixels as words
+ punpckhbw xmm3, xmm6 ; high 8 pixels as words
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6 ; low 8 pixels as words
+ punpckhbw xmm5, xmm6 ; high 8 pixels as words
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3 ; back to 16 bytes
+ movdqa [rdx], xmm2 ; overwrite the dst row
+ add rdx, rdi ; advance dst to the next row
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+; Blends an 8x8 block in place: dst = (src*w + dst*(16 - w) + 8) >> 4 per byte, with w = src_weight.
+;void vp8_filter_by_weight8x8_sse2
+;(
+; unsigned char *src, ; 8 bytes per row, loaded with movq
+; int src_stride,
+; unsigned char *dst, ; 8 bytes per row, loaded and stored with movq
+; int dst_stride,
+; int src_weight ; the dst weight is tMFQE (16) minus this value
+;)
+global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
+sym(vp8_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)] ; 16 in every word
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4 ; zero, used to widen bytes to words
+
+.combine: ; one 8-pixel row per iteration
+ movq xmm2, [rax] ; src row
+ movq xmm3, [rdx] ; dst row
+ add rax, rsi ; advance src to the next row
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4 ; low 8 bytes hold the result
+ movq [rdx], xmm2 ; overwrite the dst row
+ add rdx, rdi ; advance dst to the next row
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+; Stores *sad = (SAD(src1, src2) + 128) >> 8 and *variance = (sum(src2^2) - (sum(src2)^2 >> 8) + 128) >> 8.
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp8_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames
+ ; we can't depend on any kind of data alignment, so use unaligned loads.
+.accumulate:
+ movdqu xmm0, [rax] ; src1
+ movdqu xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1 ; one partial SAD per 8-byte half
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. instead,
+ ; it expects a signed and an unsigned value. so instead we zero extend
+ ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2 ; low 8 pixels of src2 as words
+ punpckhbw xmm1, xmm2 ; high 8 pixels of src2 as words
+ pmaddwd xmm0, xmm0 ; squares, summed pairwise into dwords
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8 ; bring the upper partial SAD down
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)] ; round
+ psrld xmm0, 8
+
+ mov rax, arg(5) ; sad output pointer
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8 ; bring the upper partial sum down
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8 ; sum^2 / 256 (256 pixels in the block)
+
+ ; phaddw could be used to sum adjacent values but we want
+ ; all the values summed. promote to doubles, accumulate,
+ ; shift and sum
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2 ; low dword = sum of all four partial sums of squares
+
+ psubd xmm1, xmm0 ; sum(src2^2) - sum(src2)^2 / 256
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4) ; variance output pointer
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128: ; rounding term added before the >> 8 in vp8_variance_and_sad_16x16_sse2
+ ddq 128 ; a single 128-bit value, so only the low dword is non-zero
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10 ; total blend weight (16) in each of the 8 words
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08 ; rounding term added before the >> 4
+
diff --git a/libvpx/vp8/common/x86/postproc_mmx.asm b/libvpx/vp8/common/x86/postproc_mmx.asm
new file mode 100644
index 0000000..966c586
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_mmx.asm
@@ -0,0 +1,314 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+; Vertical 15-tap smoothing, 4 columns at a time; a pixel is replaced only where 15*sumsq - sum^2 < flimit.
+;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_mmx) PRIVATE
+sym(vp8_mbpost_proc_down_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 136 ; 128 bytes for d plus 8 bytes for flimit2
+
+ ; d: ring of 16 delayed output rows, 4 bytes each, at [rsp + i*4]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+%define flimit2 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp8_rv))]
+%endif
+
+ ;rows +=8;
+ add dword ptr arg(2), 8 ; extra iterations flush the 8-row output delay
+
+ ;for(c=0; c<cols; c+=4)
+.loop_col:
+ mov rsi, arg(0) ;s
+ pxor mm0, mm0 ; zero, used to widen bytes/words
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9 ; arg(2) is already rows + 8, so this is rows - 1
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq mm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], mm1 ; store the row loaded above (MMX register, not xmm1)
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq mm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], mm1
+
+ dec rcx
+ jne .init_border
+
+
+ lea rsi, [rsi + rax*8]; ; rsi = s[-pitch*8]
+ neg rax ; rax = +pitch again
+
+
+ pxor mm5, mm5 ; running sum, 4 words
+ pxor mm6, mm6 ; running sum of squares, columns 0-1 (dwords)
+
+ pxor mm7, mm7 ; running sum of squares, columns 2-3 (dwords)
+ mov rdi, rsi
+
+ mov rcx, 15 ; prime the window with rows -8 .. 6
+
+.loop_initvar:
+ movd mm1, DWORD PTR [rdi];
+ punpcklbw mm1, mm0 ;
+
+ paddw mm5, mm1 ;
+ pmullw mm1, mm1 ;
+
+ movq mm2, mm1 ;
+ punpcklwd mm1, mm0 ;
+
+ punpckhwd mm2, mm0 ;
+ paddd mm6, mm1 ;
+
+ paddd mm7, mm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx ; rdx = current row index r
+.loop_row:
+ movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
+ movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw mm1, mm0
+ punpcklbw mm2, mm0
+
+ paddw mm5, mm2 ; sum += entering row
+ psubw mm5, mm1 ; sum -= leaving row
+
+ pmullw mm2, mm2
+ movq mm4, mm2
+
+ punpcklwd mm2, mm0
+ punpckhwd mm4, mm0
+
+ paddd mm6, mm2 ; sumsq += entering row squared
+ paddd mm7, mm4
+
+ pmullw mm1, mm1
+ movq mm2, mm1
+
+ punpcklwd mm1, mm0
+ psubd mm6, mm1 ; sumsq -= leaving row squared
+
+ punpckhwd mm2, mm0
+ psubd mm7, mm2
+
+
+ movq mm3, mm6
+ pslld mm3, 4
+
+ psubd mm3, mm6 ; 15 * sumsq (columns 0-1)
+ movq mm1, mm5
+
+ movq mm4, mm5
+ pmullw mm1, mm1 ; low 16 bits of sum^2
+
+ pmulhw mm4, mm4 ; high 16 bits of sum^2
+ movq mm2, mm1
+
+ punpcklwd mm1, mm4 ; sum^2 as dwords, columns 0-1
+ punpckhwd mm2, mm4 ; sum^2 as dwords, columns 2-3
+
+ movq mm4, mm7
+ pslld mm4, 4
+
+ psubd mm4, mm7 ; 15 * sumsq (columns 2-3)
+
+ psubd mm3, mm1
+ psubd mm4, mm2
+
+ psubd mm3, flimit2
+ psubd mm4, flimit2
+
+ psrad mm3, 31 ; all ones where 15*sumsq - sum^2 < flimit
+ psrad mm4, 31
+
+ packssdw mm3, mm4
+ packsswb mm3, mm0 ; per-byte select mask
+
+ movd mm1, DWORD PTR [rsi+rax*8] ; the current row r
+
+ movq mm2, mm1 ; keep the unfiltered pixels
+ punpcklbw mm1, mm0
+
+ paddw mm1, mm5 ; pixel + sum
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp8_rv))]
+ movq mm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movq mm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movq mm4, [sym(vp8_rv) + rcx*2]
+%endif
+ paddw mm1, mm4 ; add the per-row term read from vp8_rv
+ ;paddw xmm1, eight8s
+ psraw mm1, 4
+
+ packuswb mm1, mm0
+ pand mm1, mm3 ; filtered value where the mask is set
+
+ pandn mm3, mm2 ; original value elsewhere
+ por mm1, mm3
+
+ and rcx, 15
+ movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
+
+ movd [rsi], mm1 ; write row r-8, after it has left the window
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+
+ add dword arg(0), 4 ; s += 4 (NOTE(review): 32-bit add on the saved pointer - confirm for 64-bit)
+ sub dword arg(3), 4 ; cols -= 4
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 136
+ pop rsp ; presumably undoes ALIGN_STACK - confirm in x86_abi_support.asm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit2
+
+; Clamps each pixel, then adds a noise byte; every row starts at noise + (rand() & 0xff). 8 bytes per step.
+;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_mmx) PRIVATE
+sym(vp8_plane_add_noise_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop: ; one image row per iteration
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff ; random offset 0..255 into the noise buffer
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx ; rdi = noise source for this row
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax ; rax = byte offset within the row
+
+.addnoise_nextset:
+ movq mm1,[rsi+rax] ; get the source
+
+ psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb mm1, [rdx+32] ;bothclamp
+ psubusb mm1, [rdx+16] ;whiteclamp
+
+ movq mm2,[rdi+rax] ; get the noise for this line
+ paddb mm1,mm2 ; add it in
+ movq [rsi+rax],mm1 ; store the result
+
+ add rax,8 ; move to the next 8 bytes of the row
+
+ cmp rax, rcx
+ jl .addnoise_nextset ; whole 8-byte groups, so Width is in effect rounded up to a multiple of 8
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+Blur: ; not referenced by either function in this file
+ times 16 dw 16
+ times 8 dw 64
+ times 16 dw 16
+ times 8 dw 0
+
+rd: ; not referenced by either function in this file
+ times 4 dw 0x40
diff --git a/libvpx/vp8/common/x86/postproc_sse2.asm b/libvpx/vp8/common/x86/postproc_sse2.asm
new file mode 100644
index 0000000..00f84a3
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_sse2.asm
@@ -0,0 +1,721 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; In: xmm0 = centre pixels, xmm1/xmm3 = two neighbours. Out: xmm5 = avg(xmm1, xmm3), xmm7 = over-threshold mask.
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3 ; rounded average of the two neighbours
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1 ; |xmm0 - first neighbour|
+ paddusb xmm6, xmm3 ; |xmm0 - second neighbour|
+
+ ;get threshold
+ movdqa xmm2, flimit ; flimit must be %defined by the including function
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4 ; saturates to 0 where the difference >= flimit
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1 ; 0xff where the difference >= flimit
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2 ; set where either neighbour is over the threshold
+%endmacro
+
+%macro SECOND_2_ROWS 0 ; run after FIRST_2_ROWS with the other two neighbours in xmm1/xmm3; result in xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3 ; rounded average of these two neighbours
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2 ; |xmm0 - first neighbour|
+ paddusb xmm4, xmm3 ; |xmm0 - second neighbour|
+
+ pavgb xmm5, xmm1 ; average of the two neighbour-pair averages
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1 ; 0xff where the difference >= flimit
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2 ; merge with the mask from FIRST_2_ROWS
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0 ; blend the neighbour average with the centre pixel
+
+ ;decide if or not to use filtered value
+ pand xmm0, xmm7 ; keep the original where any neighbour was over the threshold
+ pandn xmm7, xmm5 ; take the filtered value elsewhere
+ paddusb xmm0, xmm7 ; the two halves are disjoint, so this merges them
+%endmacro
+
+%macro UPDATE_FLIMIT 0 ; copy the next 16 threshold bytes from [rbx] to [rsp] and advance rbx
+ movdqa xmm2, XMMWORD PTR [rbx] ; aligned load
+ movdqa [rsp], xmm2 ; aligned store
+ add rbx, 16
+%endmacro
+; Per row: 5-tap vertical filter from src into dst, then 5-tap horizontal filter in place on dst.
+;void vp8_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int cols,
+; int *flimits, ; read 16 bytes per 16 columns with movdqa and compared bytewise
+; int size ; number of rows to process
+;)
+global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(vp8_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 ; 16-byte slot holding the current thresholds
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax ; negative stride
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0 ; vertically filtered pixels go to dst
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+ ; done with the all cols, start the across filtering in place
+ sub rsi, rdx ; rewind both pointers to the start of the row
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16]; ; preload so the first delayed store rewrites what is already there
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ ; the previous block is stored only now, after this block's -1/-2 taps were read unfiltered
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone:
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8 ; the last 8 bytes are stored only when rdx == cols
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+ ; done with this row
+ add rsi,rax ;next src line
+ movsxd rax, dword arg(3) ;dst_pixels_per_line (sign-extended like the initial stride load)
+ add rdi,rax ;next destination
+ movsxd rax, dword arg(2) ;src_pixels_per_line
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp ; presumably undoes ALIGN_STACK - confirm in x86_abi_support.asm
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit
+; Vertical 15-tap smoothing, 8 columns at a time; a pixel is replaced only where 15*sumsq - sum^2 < flimit.
+;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp8_rv)
+global sym(vp8_mbpost_proc_down_xmm) PRIVATE
+sym(vp8_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16 ; 128 bytes for d plus 16 bytes for flimit4
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit4 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(vp8_rv))]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8 ; extra iterations flush the 8-row output delay
+
+ ;for(c=0; c<cols; c+=8)
+.loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ; zero, used to widen bytes/words
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9 ; arg(2) is already rows + 8, so this is rows - 1
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+ movq xmm1, QWORD ptr[rdi] ; last row
+ mov rcx, 8
+.init_borderd: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq xmm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_border
+
+
+
+ lea rsi, [rsi + rax*8]; ; rsi = s[-pitch*8]
+ neg rax ; rax = +pitch again
+
+ pxor xmm5, xmm5 ; running sum, 8 words
+ pxor xmm6, xmm6 ; running sum of squares, columns 0-3 (dwords)
+
+ pxor xmm7, xmm7 ; running sum of squares, columns 4-7 (dwords)
+ mov rdi, rsi
+
+ mov rcx, 15 ; prime the window with rows -8 .. 6
+
+.loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx ; rdx = current row index r
+.loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2 ; sum += entering row
+ psubw xmm5, xmm1 ; sum -= leaving row
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2 ; sumsq += entering row squared
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1 ; sumsq -= leaving row squared
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6 ; 15 * sumsq (columns 0-3)
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1 ; low 16 bits of sum^2
+
+ pmulhw xmm4, xmm4 ; high 16 bits of sum^2
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4 ; sum^2 as dwords, columns 0-3
+ punpckhwd xmm2, xmm4 ; sum^2 as dwords, columns 4-7
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7 ; 15 * sumsq (columns 4-7)
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31 ; all ones where 15*sumsq - sum^2 < flimit
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0 ; per-byte select mask
+
+ movq xmm1, QWORD PTR [rsi+rax*8] ; the current row r
+
+ movq xmm2, xmm1 ; keep the unfiltered pixels
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5 ; pixel + sum
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(vp8_rv))]
+ movdqu xmm4, [rax + rcx*2] ;vp8_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;vp8_rv[rcx*2]
+%else
+ movdqu xmm4, [sym(vp8_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4 ; add the per-row term read from vp8_rv
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3 ; filtered value where the mask is set
+
+ pandn xmm3, xmm2 ; original value elsewhere
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ mov rcx, rdx
+ sub rcx, 8
+
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8] (copied through an MMX register)
+
+ movq [rsi], mm0 ; write row r-8, after it has left the window
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+ add dword arg(0), 8 ; s += 8 (NOTE(review): 32-bit add on the saved pointer - confirm for 64-bit)
+ sub dword arg(3), 8 ; cols -= 8
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 128+16
+ pop rsp ; presumably undoes ALIGN_STACK - confirm in x86_abi_support.asm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+; Horizontal 15-tap smoothing in place, 4 pixels per step; a pixel is replaced only where 15*sumsq - sum^2 < flimit.
+;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
+sym(vp8_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8 ; also the start index of the variance loop below
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(3) ; rdx is borrowed as the column count here
+ movq mm1, [rsi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rdx], mm1
+ xor rdx, rdx ; sumsq = 0 again: rdx held cols for the border fill above
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax ; sum += s[i]
+ mul al ; ax = s[i] * s[i]
+ add edx, eax ; sumsq += s[i]^2
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx ; xmm7 lane 0 = sumsq
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx ; xmm6 lane 0 = sum
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx ; rcx = column index
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8 ; run 8 columns further to flush the delayed stores
+ pxor mm0, mm0 ; the two delay slots start out as zero
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0 ; zero, used to widen bytes/words
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3 ; sum^2 per pixel
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7 ; 15 * sumsq
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31 ; all ones where 15*sumsq - sum^2 < flimit
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0 ; per-byte select mask
+
+ movd xmm1, DWORD PTR [rsi+rcx] ; the 4 current pixels
+ movq xmm2, xmm1 ; keep the unfiltered pixels
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6 ; pixel + sum
+ paddd xmm1, [GLOBAL(four8s)] ; + 8 for rounding
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5 ; filtered value where the mask is set
+
+ pandn xmm5, xmm2 ; original value elsewhere
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0 ; store the result computed two steps (8 pixels) ago
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5 ; queue this step's result
+ psrldq xmm7, 12 ; carry the last pixel's sumsq into lane 0
+
+ psrldq xmm6, 12 ; carry the last pixel's sum into lane 0
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp ; presumably undoes ALIGN_STACK - confirm in x86_abi_support.asm
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+; Clamps each pixel, then adds a noise byte; every row starts at noise + (rand() & 0xff). 16 bytes per step.
+;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int Width, unsigned int Height, int Pitch)
+extern sym(rand)
+global sym(vp8_plane_add_noise_wmt) PRIVATE
+sym(vp8_plane_add_noise_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+.addnoise_loop: ; one image row per iteration
+ call sym(rand) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff ; random offset 0..255 into the noise buffer
+ add rcx, rax
+
+ ; we rely on the fact that the clamping vectors are stored contiguously
+ ; in black/white/both order. Note that we have to reload this here because
+ ; rdx could be trashed by rand()
+ mov rdx, arg(2) ; blackclamp
+
+
+ mov rdi, rcx ; rdi = noise source for this row
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax ; rax = byte offset within the row
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
+ paddusb xmm1, [rdx+32] ;bothclamp
+ psubusb xmm1, [rdx+16] ;whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next 16 bytes of the row
+
+ cmp rax, rcx
+ jl .addnoise_nextset ; whole 16-byte groups, so Width is in effect rounded up to a multiple of 16
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+four8s: ; rounding term added before the >> 4 in vp8_mbpost_proc_across_ip_xmm
+ times 4 dd 8
diff --git a/libvpx/vp8/common/x86/postproc_x86.c b/libvpx/vp8/common/x86/postproc_x86.c
new file mode 100644
index 0000000..3ec0106
--- /dev/null
+++ b/libvpx/vp8/common/x86/postproc_x86.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* On the Android NDK rand is an inline function, but the postproc assembly calls an external rand symbol. */
+#if defined(__ANDROID__)
+#define rand __rand /* make the stdlib.h definition come out under the name __rand */
+#include <stdlib.h>
+#undef rand
+
+extern int rand(void) /* real, linkable rand that forwards to the renamed one */
+{
+ return __rand();
+}
+#else
+/* ISO C forbids an empty translation unit. */
+int vp8_unused;
+#endif
diff --git a/libvpx/vp8/common/x86/recon_mmx.asm b/libvpx/vp8/common/x86/recon_mmx.asm
new file mode 100644
index 0000000..15e9871
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_mmx.asm
@@ -0,0 +1,274 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; Copies an 8x8 byte block from src to dst, one 8-byte movq per row.
+;void copy_mem8x8_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x8_mmx) PRIVATE
+sym(vp8_copy_mem8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi] ; row 0
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax] ; row 1
+ movq mm2, [rsi+rax*2] ; row 2
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movq [rdi], mm0
+ add rsi, rax ; rsi -> row 3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx*2], mm2
+
+
+ lea rdi, [rdi+rcx*2]
+ movq mm3, [rsi] ; row 3
+
+ add rdi, rcx ; rdi -> row 3
+ movq mm4, [rsi+rax] ; row 4
+
+ movq mm5, [rsi+rax*2] ; row 5
+ movq [rdi], mm3
+
+ lea rsi, [rsi+rax*2] ; rsi -> row 5
+ movq [rdi+rcx], mm4
+
+ movq [rdi+rcx*2], mm5
+ lea rdi, [rdi+rcx*2] ; rdi -> row 5
+
+ movq mm0, [rsi+rax] ; row 6
+ movq mm1, [rsi+rax*2] ; row 7
+
+ movq [rdi+rcx], mm0
+ movq [rdi+rcx*2],mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Copies an 8-byte-wide, 4-row block from src to dst, one movq per row.
+;void copy_mem8x4_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem8x4_mmx) PRIVATE
+sym(vp8_copy_mem8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movq mm0, [rsi] ; row 0
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movq mm1, [rsi+rax] ; row 1
+ movq mm2, [rsi+rax*2] ; row 2
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2] ; rsi -> row 2
+
+ movq [rdi], mm0
+ movq [rdi+rcx], mm1
+
+ movq [rdi+rcx*2], mm2
+ lea rdi, [rdi+rcx*2] ; rdi -> row 2
+
+ movq mm3, [rsi+rax] ; row 3
+ movq [rdi+rcx], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Copies a 16x16 byte block from src to dst as two 8-byte movq per row: five groups of 3 rows, then 1 row.
+;void copy_mem16x16_mmx(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+global sym(vp8_copy_mem16x16_mmx) PRIVATE
+sym(vp8_copy_mem16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movsxd rax, dword ptr arg(1) ;src_stride;
+
+ mov rdi, arg(2) ;dst;
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+
+ movq mm0, [rsi] ; rows 0-2: left half in mm0-mm2, right half in mm3-mm5
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax ; src advanced by 3 rows
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx ; dst advanced by 3 rows
+
+ movq mm0, [rsi] ; rows 3-5
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi] ; rows 6-8
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi] ; rows 9-11
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi] ; rows 12-14
+ movq mm3, [rsi+8];
+
+ movq mm1, [rsi+rax]
+ movq mm4, [rsi+rax+8]
+
+ movq mm2, [rsi+rax*2]
+ movq mm5, [rsi+rax*2+8]
+
+ lea rsi, [rsi+rax*2]
+ add rsi, rax
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ movq [rdi+rcx], mm1
+ movq [rdi+rcx+8], mm4
+
+ movq [rdi+rcx*2], mm2
+ movq [rdi+rcx*2+8], mm5
+
+ lea rdi, [rdi+rcx*2]
+ add rdi, rcx
+
+ movq mm0, [rsi] ; row 15
+ movq mm3, [rsi+8];
+
+ movq [rdi], mm0
+ movq [rdi+8], mm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm
new file mode 100644
index 0000000..fe77450
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_sse2.asm
@@ -0,0 +1,1080 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_copy_mem16x16_sse2(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride
+; )
+;
+; Copies a 16x16 block of bytes from src to dst. Source rows are read with
+; unaligned loads (movdqu); destination rows are written with aligned stores
+; (movdqa), so every destination row address must be 16-byte aligned.
+; Rows are moved three at a time (five groups), and the sixteenth row is
+; copied on its own at the end. Loads of the next group are interleaved with
+; the stores of the current one.
+global sym(vp8_copy_mem16x16_sse2) PRIVATE
+sym(vp8_copy_mem16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src;
+ movdqu xmm0, [rsi] ; row 0
+
+ movsxd rax, dword ptr arg(1) ;src_stride;
+ mov rdi, arg(2) ;dst;
+
+ movdqu xmm1, [rsi+rax] ; row 1
+ movdqu xmm2, [rsi+rax*2] ; row 2
+
+ movsxd rcx, dword ptr arg(3) ;dst_stride
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax ; rsi -> source row 3
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm3, [rsi] ; row 3
+
+ add rdi, rcx ; rdi -> destination row 3
+ movdqu xmm4, [rsi+rax] ; row 4
+
+ movdqu xmm5, [rsi+rax*2] ; row 5
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm3
+ add rsi, rax ; rsi -> source row 6
+
+ movdqa [rdi+rcx], xmm4
+ movdqa [rdi+rcx*2],xmm5
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm0, [rsi] ; row 6
+
+ add rdi, rcx ; rdi -> destination row 6
+ movdqu xmm1, [rsi+rax] ; row 7
+
+ movdqu xmm2, [rsi+rax*2] ; row 8
+ lea rsi, [rsi+rax*2]
+
+ movdqa [rdi], xmm0
+ add rsi, rax ; rsi -> source row 9
+
+ movdqa [rdi+rcx], xmm1
+
+ movdqa [rdi+rcx*2], xmm2
+ movdqu xmm3, [rsi] ; row 9
+
+ movdqu xmm4, [rsi+rax] ; row 10
+ lea rdi, [rdi+rcx*2]
+
+ add rdi, rcx ; rdi -> destination row 9
+ movdqu xmm5, [rsi+rax*2] ; row 11
+
+ lea rsi, [rsi+rax*2]
+ movdqa [rdi], xmm3
+
+ add rsi, rax ; rsi -> source row 12
+ movdqa [rdi+rcx], xmm4
+
+ movdqa [rdi+rcx*2],xmm5
+ movdqu xmm0, [rsi] ; row 12
+
+ lea rdi, [rdi+rcx*2]
+ movdqu xmm1, [rsi+rax] ; row 13
+
+ add rdi, rcx ; rdi -> destination row 12
+ movdqu xmm2, [rsi+rax*2] ; row 14
+
+ lea rsi, [rsi+rax*2] ; rsi -> source row 14
+ movdqa [rdi], xmm0
+
+ movdqa [rdi+rcx], xmm1
+ movdqa [rdi+rcx*2],xmm2
+
+ movdqu xmm3, [rsi+rax] ; row 15 (last row, copied alone)
+ lea rdi, [rdi+rcx*2] ; rdi -> destination row 14
+
+ movdqa [rdi+rcx], xmm3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_intra_pred_uv_dc_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for an 8x8 block using both neighbours: every output byte is
+; (sum of the 8 bytes at above + sum of 8 bytes read down the left column,
+; stepping by left_stride, + 8) >> 4. The value is written to 8 rows of
+; 8 bytes at dst.
+global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dc_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rdi, arg(2) ;above;
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor mm0, mm0
+ movq mm1, [rdi]
+ lea rdi, [rax*3] ; rdi = 3 * left_stride (above has already been loaded)
+ psadbw mm1, mm0 ; SAD against zero = sum of the 8 above bytes
+ ; from left
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax*1]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+
+ movzx edx, byte [rsi+rdi]
+ lea rsi, [rsi+rax*4] ; advance to left row 4
+ add ecx, edx
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx ; ecx = sum of the 8 left bytes
+
+ ; add up
+ pextrw edx, mm1, 0x0 ; edx = sum of the above bytes
+ lea edx, [edx+ecx+8] ; total of 16 samples plus rounding
+ sar edx, 4 ; divide by 16
+ movd mm1, edx
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ pshufw mm1, mm1, 0x0 ; broadcast the DC word to all four words
+ mov rdi, arg(0) ;dst;
+ packuswb mm1, mm1 ; narrow to 8 identical bytes
+
+ ; write out
+ lea rax, [rcx*3] ; rax = 3 * dst_stride
+ lea rdx, [rdi+rcx*4] ; rdx -> destination row 4
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ movq [rdx ], mm1
+ movq [rdx+rcx ], mm1
+ movq [rdx+rcx*2], mm1
+ movq [rdx+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dctop_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for an 8x8 block using only the row above: every output
+; byte is (sum of the 8 bytes at above + 4) >> 3. left and left_stride are
+; ignored.
+global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dctop_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ ; from top
+ mov rsi, arg(2) ;above;
+ pxor mm0, mm0
+ movq mm1, [rsi]
+ psadbw mm1, mm0 ; SAD against zero = sum of the 8 above bytes (low word)
+
+ ; add up
+ paddw mm1, [GLOBAL(dc_4)] ; rounding term
+ psraw mm1, 3 ; divide by 8
+ pshufw mm1, mm1, 0x0 ; broadcast the low word (the DC value)
+ packuswb mm1, mm1 ; narrow to 8 identical bytes
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3] ; rax = 3 * dst_stride
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4] ; rows 4..7
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dcleft_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for an 8x8 block using only the left column: every output
+; byte is (sum of 8 bytes read from left, stepping by left_stride, + 4) >> 3.
+; above is ignored.
+global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
+sym(vp8_intra_pred_uv_dcleft_mmx2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; from left
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ lea rdi, [rax*3] ; rdi = 3 * left_stride
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; advance to left row 4
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+4] ; add the last sample and the rounding term
+
+ ; add up
+ shr edx, 3 ; divide by 8
+ movd mm1, edx
+ pshufw mm1, mm1, 0x0 ; broadcast the DC word
+ packuswb mm1, mm1 ; narrow to 8 identical bytes
+
+ ; write out
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3] ; rax = 3 * dst_stride
+
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+ lea rdi, [rdi+rcx*4] ; rows 4..7
+ movq [rdi ], mm1
+ movq [rdi+rcx ], mm1
+ movq [rdi+rcx*2], mm1
+ movq [rdi+rax ], mm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_dc128_mmx(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for an 8x8 block when no neighbour is used: fills 8 rows of
+; 8 bytes at dst with the constant 128. above, left and left_stride are
+; ignored.
+global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
+sym(vp8_intra_pred_uv_dc128_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(2), arg(3), arg(4) not used
+
+ ; write out
+ movq mm1, [GLOBAL(dc_128)] ; first 8 bytes of the 128-filled constant
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3] ; rcx = 3 * dst_stride
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4] ; rows 4..7
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_tm_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; Macro generating vp8_intra_pred_uv_tm_<variant> (%1 = sse2 or ssse3).
+; For an 8x8 block, each output byte is
+;   saturate_u8(left[row] + above[col] - above[-1])
+; where left[row] is read at left + row * left_stride. The two variants only
+; differ in how a single byte is broadcast to eight 16-bit words: the sse2
+; build uses punpcklbw/pshuflw/punpcklqdq, the ssse3 build a single pshufb
+; with the dc_1024 mask.
+%macro vp8_intra_pred_uv_tm 1
+global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
+sym(vp8_intra_pred_uv_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; read top row
+ mov edx, 4 ; loop counter: 4 iterations x 2 rows
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm2, [GLOBAL(dc_1024)] ; pshufb mask: byte 0 -> every word
+%endif
+ movq xmm1, [rsi]
+ punpcklbw xmm1, xmm0 ; above[0..7] widened to words
+
+ ; set up left ptrs and subtract topleft
+ movd xmm3, [rsi-1] ; low byte is above[-1], the top-left sample
+ mov rsi, arg(3) ;left;
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ punpcklqdq xmm3, xmm3
+%else
+ pshufb xmm3, xmm2
+%endif
+ psubw xmm1, xmm3 ; xmm1 = above[col] - topleft, reused for every row
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+.vp8_intra_pred_uv_tm_%1_loop:
+ movd xmm3, [rsi] ; low byte = left sample for the even row
+ movd xmm5, [rsi+rax] ; low byte = left sample for the odd row
+%ifidn %1, sse2
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm3, xmm3, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm3, xmm3
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm3, xmm2
+ pshufb xmm5, xmm2
+%endif
+ paddw xmm3, xmm1
+ paddw xmm5, xmm1
+ packuswb xmm3, xmm5 ; saturate to bytes: low half = even row, high half = odd row
+ movq [rdi ], xmm3
+ movhps[rdi+rcx], xmm3
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_uv_tm_%1_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_tm sse2
+vp8_intra_pred_uv_tm ssse3
+
+;void vp8_intra_pred_uv_ve_mmx(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; Vertical prediction for an 8x8 block: copies the 8 bytes at above into
+; each of the 8 destination rows. left and left_stride are ignored.
+global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
+sym(vp8_intra_pred_uv_ve_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ ; end prolog
+
+ ; arg(3), arg(4) not used
+
+ ; read from top
+ mov rax, arg(2) ;above;
+
+ movq mm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3] ; rcx = 3 * dst_stride
+
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+ lea rax, [rax+rdx*4] ; rows 4..7
+ movq [rax ], mm1
+ movq [rax+rdx ], mm1
+ movq [rax+rdx*2], mm1
+ movq [rax+rcx ], mm1
+
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_uv_ho_mmx2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; Macro generating vp8_intra_pred_uv_ho_<variant> (%1 = mmx2 or ssse3).
+; Horizontal prediction for an 8x8 block: row r of dst is filled with the
+; byte at left + r * left_stride. above is ignored.
+; The mmx2 build loops four times over two rows; the ssse3 build is fully
+; unrolled, handles four rows per half, and uses rbx as 3 * left_stride,
+; which is why rbx is saved explicitly when GET_GOT does not already do so.
+%macro vp8_intra_pred_uv_ho 1
+global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
+sym(vp8_intra_pred_uv_ho_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+%ifidn %1, ssse3
+%ifndef GET_GOT_SAVE_ARG
+ push rbx
+%endif
+ GET_GOT rbx
+%endif
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; read from left and write out
+%ifidn %1, mmx2
+ mov edx, 4 ; loop counter: 4 iterations x 2 rows
+%endif
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+%ifidn %1, ssse3
+ lea rdx, [rcx*3] ; rdx = 3 * dst_stride
+ movdqa xmm2, [GLOBAL(dc_00001111)] ; must be loaded before rbx is reused below
+ lea rbx, [rax*3] ; rbx = 3 * left_stride
+%endif
+
+%ifidn %1, mmx2
+.vp8_intra_pred_uv_ho_%1_loop:
+ movd mm0, [rsi]
+ movd mm1, [rsi+rax]
+ punpcklbw mm0, mm0 ; duplicate each byte so word 0 holds the left byte twice
+ punpcklbw mm1, mm1
+ pshufw mm0, mm0, 0x0 ; broadcast word 0 -> 8 copies of the left byte
+ pshufw mm1, mm1, 0x0
+ movq [rdi ], mm0
+ movq [rdi+rcx], mm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_uv_ho_%1_loop
+%else
+ ; rows 0..3
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3 ; byte 0 = left of row 0, byte 1 = left of row 1
+ punpcklbw xmm1, xmm4 ; byte 0 = left of row 2, byte 1 = left of row 3
+ pshufb xmm0, xmm2 ; low 8 bytes = byte 0, high 8 bytes = byte 1
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+ lea rsi, [rsi+rax*4]
+ lea rdi, [rdi+rcx*4]
+ ; rows 4..7
+ movd xmm0, [rsi]
+ movd xmm3, [rsi+rax]
+ movd xmm1, [rsi+rax*2]
+ movd xmm4, [rsi+rbx]
+ punpcklbw xmm0, xmm3
+ punpcklbw xmm1, xmm4
+ pshufb xmm0, xmm2
+ pshufb xmm1, xmm2
+ movq [rdi ], xmm0
+ movhps [rdi+rcx], xmm0
+ movq [rdi+rcx*2], xmm1
+ movhps [rdi+rdx], xmm1
+%endif
+
+ ; begin epilog
+%ifidn %1, ssse3
+ RESTORE_GOT
+%ifndef GET_GOT_SAVE_ARG
+ pop rbx
+%endif
+%endif
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_uv_ho mmx2
+vp8_intra_pred_uv_ho ssse3
+
+;void vp8_intra_pred_y_dc_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for a 16x16 block using both neighbours: every output byte
+; is (sum of the 16 bytes at above + sum of 16 bytes read down the left
+; column, stepping by left_stride, + 16) >> 5.
+; above is read with movdqa and dst rows are written with movdqa, so both
+; must be 16-byte aligned.
+global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
+sym(vp8_intra_pred_y_dc_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; from top
+ mov rdi, arg(2) ;above
+ mov rsi, arg(3) ;left
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
+ pxor xmm0, xmm0
+ movdqa xmm1, [rdi]
+ psadbw xmm1, xmm0 ; two partial sums: bytes 0..7 and bytes 8..15
+ movq xmm2, xmm1
+ punpckhqdq xmm1, xmm1
+ paddw xmm1, xmm2 ; low word = sum of all 16 above bytes
+
+ ; from left
+ lea rdi, [rax*3] ; rdi = 3 * left_stride
+
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; left rows 4..7
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; left rows 8..11
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; left rows 12..15
+
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx ; ecx = sum of the 16 left bytes
+
+ ; add up
+ pextrw edx, xmm1, 0x0
+ lea edx, [edx+ecx+16] ; total of 32 samples plus rounding
+ sar edx, 5 ; divide by 32
+ movd xmm1, edx
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1 ; DC value in all eight words
+ packuswb xmm1, xmm1 ; 16 identical bytes
+
+ ; write out
+ mov rsi, 2 ; two passes of 8 rows
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3] ; rax = 3 * dst_stride
+
+.label:
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dctop_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for a 16x16 block using only the row above: every output
+; byte is (sum of the 16 bytes at above + 8) >> 4. left and left_stride are
+; ignored. above and every dst row are accessed with movdqa and must be
+; 16-byte aligned.
+global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
+sym(vp8_intra_pred_y_dctop_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ ; from top
+ mov rcx, arg(2) ;above;
+ pxor xmm0, xmm0
+ movdqa xmm1, [rcx]
+ psadbw xmm1, xmm0 ; two partial sums: bytes 0..7 and bytes 8..15
+ movdqa xmm2, xmm1
+ punpckhqdq xmm1, xmm1
+ paddw xmm1, xmm2 ; low word = sum of all 16 above bytes
+
+ ; add up
+ paddw xmm1, [GLOBAL(dc_8)] ; rounding term
+ psraw xmm1, 4 ; divide by 16
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1 ; DC value in all eight words
+ packuswb xmm1, xmm1 ; 16 identical bytes
+
+ ; write out
+ mov rsi, 2 ; two passes of 8 rows
+ mov rdx, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3] ; rax = 3 * dst_stride
+
+.label:
+ movdqa [rdx ], xmm1
+ movdqa [rdx+rcx ], xmm1
+ movdqa [rdx+rcx*2], xmm1
+ movdqa [rdx+rax ], xmm1
+ lea rdx, [rdx+rcx*4]
+ movdqa [rdx ], xmm1
+ movdqa [rdx+rcx ], xmm1
+ movdqa [rdx+rcx*2], xmm1
+ movdqa [rdx+rax ], xmm1
+ lea rdx, [rdx+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dcleft_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for a 16x16 block using only the left column: every output
+; byte is (sum of 16 bytes read from left, stepping by left_stride, + 8) >> 4.
+; above is ignored. Every dst row is written with movdqa and must be
+; 16-byte aligned.
+global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
+sym(vp8_intra_pred_y_dcleft_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; from left
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+
+ lea rdi, [rax*3] ; rdi = 3 * left_stride
+ movzx ecx, byte [rsi]
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; left rows 4..7
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; left rows 8..11
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ add ecx, edx
+ lea rsi, [rsi+rax*4] ; left rows 12..15
+ movzx edx, byte [rsi]
+ add ecx, edx
+ movzx edx, byte [rsi+rax]
+ add ecx, edx
+ movzx edx, byte [rsi+rax*2]
+ add ecx, edx
+ movzx edx, byte [rsi+rdi]
+ lea edx, [ecx+edx+8] ; add the last sample and the rounding term
+
+ ; add up
+ shr edx, 4 ; divide by 16
+ movd xmm1, edx
+ ; FIXME use pshufb for ssse3 version
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm1, xmm1 ; DC value in all eight words
+ packuswb xmm1, xmm1 ; 16 identical bytes
+
+ ; write out
+ mov rsi, 2 ; two passes of 8 rows
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+ lea rax, [rcx*3] ; rax = 3 * dst_stride
+
+.label:
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ movdqa [rdi ], xmm1
+ movdqa [rdi+rcx ], xmm1
+ movdqa [rdi+rcx*2], xmm1
+ movdqa [rdi+rax ], xmm1
+ lea rdi, [rdi+rcx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_dc128_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; DC prediction for a 16x16 block when no neighbour is used: fills 16 rows
+; of 16 bytes at dst with the constant 128. above, left and left_stride are
+; ignored. Every dst row is written with movdqa and must be 16-byte aligned.
+global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
+sym(vp8_intra_pred_y_dc128_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ GET_GOT rbx
+ ; end prolog
+
+ ;arg(2), arg(3), arg(4) not used
+
+ ; write out
+ mov rsi, 2 ; two passes of 8 rows
+ movdqa xmm1, [GLOBAL(dc_128)]
+ mov rax, arg(0) ;dst;
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+ lea rcx, [rdx*3] ; rcx = 3 * dst_stride
+
+.label:
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_tm_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; Macro generating vp8_intra_pred_y_tm_<variant> (%1 = sse2 or ssse3).
+; For a 16x16 block, each output byte is
+;   saturate_u8(left[row] + above[col] - above[-1])
+; where left[row] is read at left + row * left_stride. above and every dst
+; row are accessed with movdqa and must be 16-byte aligned.
+; The routine uses xmm6 and xmm7, which are callee-saved in the Win64
+; calling convention, so they are preserved with SAVE_XMM / RESTORE_XMM.
+%macro vp8_intra_pred_y_tm 1
+global sym(vp8_intra_pred_y_tm_%1) PRIVATE
+sym(vp8_intra_pred_y_tm_%1):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ GET_GOT rbx
+ ; end prolog
+
+ ; read top row
+ mov edx, 8 ; loop counter: 8 iterations x 2 rows
+ mov rsi, arg(2) ;above
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ pxor xmm0, xmm0
+%ifidn %1, ssse3
+ movdqa xmm3, [GLOBAL(dc_1024)] ; pshufb mask: byte 0 -> every word
+%endif
+ movdqa xmm1, [rsi]
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0 ; above[0..7] as words
+ punpckhbw xmm2, xmm0 ; above[8..15] as words
+
+ ; set up left ptrs and subtract topleft
+ movd xmm4, [rsi-1] ; low byte is above[-1], the top-left sample
+ mov rsi, arg(3) ;left
+%ifidn %1, sse2
+ punpcklbw xmm4, xmm0
+ pshuflw xmm4, xmm4, 0x0
+ punpcklqdq xmm4, xmm4
+%else
+ pshufb xmm4, xmm3
+%endif
+ psubw xmm1, xmm4 ; above[col] - topleft, reused for every row
+ psubw xmm2, xmm4
+
+ ; set up dest ptrs
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+.vp8_intra_pred_y_tm_%1_loop:
+ movd xmm4, [rsi] ; low byte = left sample for the even row
+ movd xmm5, [rsi+rax] ; low byte = left sample for the odd row
+%ifidn %1, sse2
+ punpcklbw xmm4, xmm0
+ punpcklbw xmm5, xmm0
+ pshuflw xmm4, xmm4, 0x0
+ pshuflw xmm5, xmm5, 0x0
+ punpcklqdq xmm4, xmm4
+ punpcklqdq xmm5, xmm5
+%else
+ pshufb xmm4, xmm3
+ pshufb xmm5, xmm3
+%endif
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ paddw xmm4, xmm1
+ paddw xmm6, xmm2
+ paddw xmm5, xmm1
+ paddw xmm7, xmm2
+ packuswb xmm4, xmm6 ; saturate to bytes: 16 pixels of the even row
+ packuswb xmm5, xmm7 ; 16 pixels of the odd row
+ movdqa [rdi ], xmm4
+ movdqa [rdi+rcx], xmm5
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_y_tm_%1_loop
+
+ ; begin epilog
+ RESTORE_GOT
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endmacro
+
+vp8_intra_pred_y_tm sse2
+vp8_intra_pred_y_tm ssse3
+
+;void vp8_intra_pred_y_ve_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; Vertical prediction for a 16x16 block: copies the 16 bytes at above into
+; each of the 16 destination rows. left and left_stride are ignored.
+; above and every dst row are accessed with movdqa and must be 16-byte
+; aligned.
+global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
+sym(vp8_intra_pred_y_ve_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ ; end prolog
+
+ ;arg(3), arg(4) not used
+
+ mov rax, arg(2) ;above;
+ mov rsi, 2 ; two passes of 8 rows
+ movsxd rdx, dword ptr arg(1) ;dst_stride
+
+ ; read from top
+ movdqa xmm1, [rax]
+
+ ; write out
+ mov rax, arg(0) ;dst;
+ lea rcx, [rdx*3] ; rcx = 3 * dst_stride
+
+.label:
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ movdqa [rax ], xmm1
+ movdqa [rax+rdx ], xmm1
+ movdqa [rax+rdx*2], xmm1
+ movdqa [rax+rcx ], xmm1
+ lea rax, [rax+rdx*4]
+ dec rsi
+ jnz .label
+
+ ; begin epilog
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_intra_pred_y_ho_sse2(
+; unsigned char *dst,
+; int dst_stride,
+; unsigned char *above,
+; unsigned char *left,
+; int left_stride
+; )
+;
+; Horizontal prediction for a 16x16 block: row r of dst is filled with the
+; byte at left + r * left_stride. above is ignored. Every dst row is
+; written with movdqa and must be 16-byte aligned.
+global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
+sym(vp8_intra_pred_y_ho_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;arg(2) not used
+
+ ; read from left and write out
+ mov edx, 8 ; loop counter: 8 iterations x 2 rows
+ mov rsi, arg(3) ;left;
+ movsxd rax, dword ptr arg(4) ;left_stride;
+ mov rdi, arg(0) ;dst;
+ movsxd rcx, dword ptr arg(1) ;dst_stride
+
+; The loop label is function-local (leading dot), like the uv predictors'.
+.vp8_intra_pred_y_ho_sse2_loop:
+ movd xmm0, [rsi]
+ movd xmm1, [rsi+rax]
+ ; FIXME use pshufb for ssse3 version
+ punpcklbw xmm0, xmm0 ; duplicate each byte so word 0 holds the left byte twice
+ punpcklbw xmm1, xmm1
+ pshuflw xmm0, xmm0, 0x0 ; broadcast word 0 across the low quadword
+ pshuflw xmm1, xmm1, 0x0
+ punpcklqdq xmm0, xmm0 ; copy the low quadword to the high one -> 16 copies
+ punpcklqdq xmm1, xmm1
+ movdqa [rdi ], xmm0
+ movdqa [rdi+rcx], xmm1
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rcx*2]
+ dec edx
+ jnz .vp8_intra_pred_y_ho_sse2_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Read-only constants shared by the intra predictors in this file.
+SECTION_RODATA
+align 16
+; 16 bytes of 128: fill value of the dc128 predictors.
+dc_128:
+ times 16 db 128
+; four words of 4: rounding term for the 8-sample DC average (MMX, 8 bytes).
+dc_4:
+ times 4 dw 4
+align 16
+; eight words of 8: rounding term for the 16-sample DC average.
+dc_8:
+ times 8 dw 8
+align 16
+; pshufb mask, bytes 00 04 repeated: each result word takes source byte 0 as
+; its low byte and source byte 4 as its high byte. Used after a movd load,
+; where byte 4 is zero, to broadcast byte 0 as eight zero-extended words.
+dc_1024:
+ times 8 dw 0x400
+align 16
+; pshufb mask: low 8 result bytes copy source byte 0, high 8 copy source byte 1.
+dc_00001111:
+ times 8 db 0
+ times 8 db 1
diff --git a/libvpx/vp8/common/x86/recon_wrapper_sse2.c b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
new file mode 100644
index 0000000..b482faa
--- /dev/null
+++ b/libvpx/vp8/common/x86/recon_wrapper_sse2.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/common/blockd.h"
+
+/* Declares `sym` with the common signature of the chroma (8x8) intra
+ * predictor kernels: destination block and stride, pointer to the row
+ * above, pointer to the left column and the stride used to walk it. */
+#define build_intra_predictors_mbuv_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
+/* Pointer-to-function type with that same signature. */
+typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
+
+/* Chroma predictor kernels; the bodies are not in this file. */
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
+
+/*
+ * Picks the chroma intra predictor that matches the macroblock's uv_mode
+ * and runs it once for the U plane and once for the V plane.
+ *
+ * tm_func and ho_func supply the TM_PRED and H_PRED kernels so that callers
+ * can choose the instruction-set variant. For DC_PRED the kernel depends on
+ * which neighbours are available (both, top only, left only, none).
+ * Any other mode leaves both destinations untouched.
+ */
+static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+                                                unsigned char * uabove_row,
+                                                unsigned char * vabove_row,
+                                                unsigned char *dst_u,
+                                                unsigned char *dst_v,
+                                                int dst_stride,
+                                                unsigned char * uleft,
+                                                unsigned char * vleft,
+                                                int left_stride,
+                                                build_intra_predictors_mbuv_fn_t tm_func,
+                                                build_intra_predictors_mbuv_fn_t ho_func)
+{
+    build_intra_predictors_mbuv_fn_t predict;
+    int uv_mode = x->mode_info_context->mbmi.uv_mode;
+
+    switch (uv_mode) {
+    case V_PRED:
+        predict = vp8_intra_pred_uv_ve_mmx;
+        break;
+    case H_PRED:
+        predict = ho_func;
+        break;
+    case TM_PRED:
+        predict = tm_func;
+        break;
+    case DC_PRED:
+        if (x->up_available && x->left_available)
+            predict = vp8_intra_pred_uv_dc_mmx2;
+        else if (x->up_available)
+            predict = vp8_intra_pred_uv_dctop_mmx2;
+        else if (x->left_available)
+            predict = vp8_intra_pred_uv_dcleft_mmx2;
+        else
+            predict = vp8_intra_pred_uv_dc128_mmx;
+        break;
+    default:
+        return;
+    }
+
+    predict(dst_u, dst_stride, uabove_row, uleft, left_stride);
+    predict(dst_v, dst_stride, vabove_row, vleft, left_stride);
+}
+
+/*
+ * SSE2 entry point for chroma intra prediction: forwards to the shared x86
+ * dispatcher with the SSE2 TM kernel and the MMX2 horizontal kernel.
+ */
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
+                                            unsigned char * uabove_row,
+                                            unsigned char * vabove_row,
+                                            unsigned char * uleft,
+                                            unsigned char * vleft,
+                                            int left_stride,
+                                            unsigned char * upred_ptr,
+                                            unsigned char * vpred_ptr,
+                                            int pred_stride)
+{
+    build_intra_predictors_mbuv_fn_t tm_kernel = vp8_intra_pred_uv_tm_sse2;
+    build_intra_predictors_mbuv_fn_t ho_kernel = vp8_intra_pred_uv_ho_mmx2;
+
+    vp8_build_intra_predictors_mbuv_x86(x, uabove_row, vabove_row,
+                                        upred_ptr, vpred_ptr, pred_stride,
+                                        uleft, vleft, left_stride,
+                                        tm_kernel, ho_kernel);
+}
+
+/*
+ * SSSE3 entry point for chroma intra prediction: forwards to the shared x86
+ * dispatcher with the SSSE3 TM and horizontal kernels.
+ */
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
+                                             unsigned char * uabove_row,
+                                             unsigned char * vabove_row,
+                                             unsigned char * uleft,
+                                             unsigned char * vleft,
+                                             int left_stride,
+                                             unsigned char * upred_ptr,
+                                             unsigned char * vpred_ptr,
+                                             int pred_stride)
+{
+    build_intra_predictors_mbuv_fn_t tm_kernel = vp8_intra_pred_uv_tm_ssse3;
+    build_intra_predictors_mbuv_fn_t ho_kernel = vp8_intra_pred_uv_ho_ssse3;
+
+    vp8_build_intra_predictors_mbuv_x86(x, uabove_row, vabove_row,
+                                        upred_ptr, vpred_ptr, pred_stride,
+                                        uleft, vleft, left_stride,
+                                        tm_kernel, ho_kernel);
+}
+
+/* Declares `sym` with the common signature of the luma (16x16) intra
+ * predictor kernels: destination block and stride, pointer to the row
+ * above, pointer to the left column and the stride used to walk it. */
+#define build_intra_predictors_mby_prototype(sym) \
+ void sym(unsigned char *dst, int dst_stride, \
+ const unsigned char *above, \
+ const unsigned char *left, int left_stride)
+/* Pointer-to-function type with that same signature. */
+typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t));
+
+/* Luma predictor kernels; the bodies are not in this file. */
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3);
+
+/*
+ * Picks the luma intra predictor that matches the macroblock's mode and
+ * runs it on dst_y.
+ *
+ * tm_func supplies the TM_PRED kernel so that callers can choose the
+ * instruction-set variant. For DC_PRED the kernel depends on which
+ * neighbours are available (both, top only, left only, none).
+ * Any other mode leaves dst_y untouched.
+ */
+static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
+                                               unsigned char * yabove_row,
+                                               unsigned char *dst_y,
+                                               int dst_stride,
+                                               unsigned char * yleft,
+                                               int left_stride,
+                                               build_intra_predictors_mby_fn_t tm_func)
+{
+    int mode = x->mode_info_context->mbmi.mode;
+    /* Luma kernels use the luma function-pointer type, not the chroma one. */
+    build_intra_predictors_mby_fn_t fn;
+
+    switch (mode) {
+    case V_PRED:
+        fn = vp8_intra_pred_y_ve_sse2;
+        break;
+    case H_PRED:
+        fn = vp8_intra_pred_y_ho_sse2;
+        break;
+    case TM_PRED:
+        fn = tm_func;
+        break;
+    case DC_PRED:
+        if (x->up_available) {
+            if (x->left_available) {
+                fn = vp8_intra_pred_y_dc_sse2;
+            } else {
+                fn = vp8_intra_pred_y_dctop_sse2;
+            }
+        } else if (x->left_available) {
+            fn = vp8_intra_pred_y_dcleft_sse2;
+        } else {
+            fn = vp8_intra_pred_y_dc128_sse2;
+        }
+        break;
+    default:
+        return;
+    }
+
+    fn(dst_y, dst_stride, yabove_row, yleft, left_stride);
+}
+
+/*
+ * SSE2 entry point for luma intra prediction: forwards to the shared x86
+ * dispatcher with the SSE2 TM kernel.
+ */
+void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x,
+                                           unsigned char * yabove_row,
+                                           unsigned char * yleft,
+                                           int left_stride,
+                                           unsigned char * ypred_ptr,
+                                           int y_stride)
+{
+    build_intra_predictors_mby_fn_t tm_kernel = vp8_intra_pred_y_tm_sse2;
+
+    vp8_build_intra_predictors_mby_x86(x,
+                                       yabove_row,
+                                       ypred_ptr, y_stride,
+                                       yleft, left_stride,
+                                       tm_kernel);
+}
+
+/*
+ * SSSE3 entry point for luma intra prediction: forwards to the shared x86
+ * dispatcher with the SSSE3 TM kernel.
+ */
+void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x,
+                                            unsigned char * yabove_row,
+                                            unsigned char * yleft,
+                                            int left_stride,
+                                            unsigned char * ypred_ptr,
+                                            int y_stride)
+{
+    build_intra_predictors_mby_fn_t tm_kernel = vp8_intra_pred_y_tm_ssse3;
+
+    vp8_build_intra_predictors_mby_x86(x,
+                                       yabove_row,
+                                       ypred_ptr, y_stride,
+                                       yleft, left_stride,
+                                       tm_kernel);
+}
diff --git a/libvpx/vp8/common/x86/sad_mmx.asm b/libvpx/vp8/common/x86/sad_mmx.asm
new file mode 100644
index 0000000..592112f
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_mmx.asm
@@ -0,0 +1,427 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx) PRIVATE
+global sym(vp8_sad8x16_mmx) PRIVATE
+global sym(vp8_sad8x8_mmx) PRIVATE
+global sym(vp8_sad4x4_mmx) PRIVATE
+global sym(vp8_sad16x8_mmx) PRIVATE
+
+;unsigned int vp8_sad16x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Returns the sum of absolute differences between a 16x16 block at src_ptr
+; and a 16x16 block at ref_ptr. Per-byte |a - b| is formed as
+; (a -sat b) | (b -sat a), widened to words and accumulated in mm7.
+; Only the low 32 bits of the returned register hold the result.
+sym(vp8_sad16x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8] ; rcx = src_ptr + 16 * src_stride (loop end marker)
+ pxor mm7, mm7 ; word accumulators
+
+ pxor mm6, mm6 ; constant zero for unpacking
+
+.x16x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi]
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1 ; max(src - ref, 0)
+ psubusb mm1, mm4 ; max(ref - src, 0)
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1 ; |src - ref| for bytes 0..7
+ por mm2, mm3 ; |src - ref| for bytes 8..15
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6 ; widen to words
+ punpcklbw mm2, mm6
+
+ punpckhbw mm1, mm6
+ punpckhbw mm3, mm6
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+
+ lea rsi, [rsi+rax] ; next source row
+ add rdi, rdx ; next reference row
+
+ paddw mm7, mm0
+ paddw mm7, mm1
+
+ cmp rsi, rcx
+ jne .x16x16sad_mmx_loop
+
+
+ ; horizontal add of the four word accumulators
+ movq mm0, mm7
+
+ punpcklwd mm0, mm6
+ punpckhwd mm7, mm6
+
+ paddw mm0, mm7
+ movq mm7, mm0
+
+
+ psrlq mm0, 32
+ paddw mm7, mm0 ; low word of mm7 = total SAD
+
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x16_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Returns the sum of absolute differences between an 8-wide, 16-row block
+; at src_ptr and the same-sized block at ref_ptr. One row is processed per
+; loop iteration. Only the low 32 bits of the returned register hold the
+; result.
+sym(vp8_sad8x16_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8] ; rcx = src_ptr + 16 * src_stride (loop end marker)
+ pxor mm7, mm7 ; word accumulators
+
+ pxor mm6, mm6 ; constant zero for unpacking
+
+.x8x16sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1 ; max(src - ref, 0)
+
+ psubusb mm1, mm2 ; max(ref - src, 0)
+ por mm0, mm1 ; |src - ref| per byte
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6 ; widen to words
+
+ punpckhbw mm2, mm6
+ lea rsi, [rsi+rax] ; next source row
+
+ add rdi, rdx ; next reference row
+ paddw mm7, mm0
+
+ paddw mm7, mm2
+ cmp rsi, rcx
+
+ jne .x8x16sad_mmx_loop
+
+ ; horizontal add of the four word accumulators
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0 ; low word of mm7 = total SAD
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Returns the sum of absolute differences between an 8x8 block at src_ptr
+; and an 8x8 block at ref_ptr. One row is processed per loop iteration.
+; Only the low 32 bits of the returned register hold the result.
+sym(vp8_sad8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8] ; rcx = src_ptr + 8 * src_stride (loop end marker)
+ pxor mm7, mm7 ; word accumulators
+
+ pxor mm6, mm6 ; constant zero for unpacking
+
+.x8x8sad_mmx_loop:
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, mm0
+ psubusb mm0, mm1 ; max(src - ref, 0)
+
+ psubusb mm1, mm2 ; max(ref - src, 0)
+ por mm0, mm1 ; |src - ref| per byte
+
+ movq mm2, mm0
+ punpcklbw mm0, mm6 ; widen to words
+
+ punpckhbw mm2, mm6
+ paddw mm0, mm2
+
+ lea rsi, [rsi+rax] ; next source row
+ add rdi, rdx ; next reference row
+
+ paddw mm7, mm0
+ cmp rsi, rcx
+
+ jne .x8x8sad_mmx_loop
+
+ ; horizontal add of the four word accumulators
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0 ; low word of mm7 = total SAD
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Returns the sum of absolute differences between a 4x4 block at src_ptr
+; and a 4x4 block at ref_ptr. The block is handled without a loop, two rows
+; at a time: the two 4-byte rows are interleaved into one 8-byte register
+; (source and reference alike, so the pairing of bytes is preserved).
+; Only the low 32 bits of the returned register hold the result.
+sym(vp8_sad4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ ; rows 0 and 1
+ movd mm0, DWORD PTR [rsi]
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2 ; source rows 0/1 interleaved
+ punpcklbw mm1, mm3 ; reference rows 0/1 interleaved
+
+ movq mm2, mm0
+ psubusb mm0, mm1 ; max(src - ref, 0)
+
+ psubusb mm1, mm2 ; max(ref - src, 0)
+ por mm0, mm1 ; |src - ref| per byte
+
+ movq mm2, mm0
+ pxor mm3, mm3 ; constant zero for unpacking
+
+ punpcklbw mm0, mm3
+ punpckhbw mm2, mm3
+
+ paddw mm0, mm2 ; word sums for rows 0 and 1
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ ; rows 2 and 3
+ movd mm4, DWORD PTR [rsi]
+ movd mm5, DWORD PTR [rdi]
+
+ movd mm6, DWORD PTR [rsi+rax]
+ movd mm7, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm4, mm6
+ punpcklbw mm5, mm7
+
+ movq mm6, mm4
+ psubusb mm4, mm5
+
+ psubusb mm5, mm6
+ por mm4, mm5
+
+ movq mm5, mm4
+ punpcklbw mm4, mm3
+
+ punpckhbw mm5, mm3
+ paddw mm4, mm5 ; word sums for rows 2 and 3
+
+ ; horizontal add of the four word sums
+ paddw mm0, mm4
+ movq mm1, mm0
+
+ punpcklwd mm0, mm3
+ punpckhwd mm1, mm3
+
+ paddw mm0, mm1
+ movq mm1, mm0
+
+ psrlq mm0, 32
+ paddw mm0, mm1 ; low word of mm0 = total SAD
+
+ movq rax, mm0
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Returns the sum of absolute differences between a 16-wide, 8-row block at
+; src_ptr and the same-sized block at ref_ptr. One row (two 8-byte halves)
+; is processed per loop iteration. Only the low 32 bits of the returned
+; register hold the result.
+sym(vp8_sad16x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8] ; rcx = src_ptr + 8 * src_stride (loop end marker)
+ pxor mm7, mm7 ; word accumulators
+
+ pxor mm6, mm6 ; constant zero for unpacking
+
+.x16x8sad_mmx_loop:
+
+ movq mm0, [rsi]
+ movq mm1, [rdi]
+
+ movq mm2, [rsi+8]
+ movq mm3, [rdi+8]
+
+ movq mm4, mm0
+ movq mm5, mm2
+
+ psubusb mm0, mm1 ; max(src - ref, 0)
+ psubusb mm1, mm4 ; max(ref - src, 0)
+
+ psubusb mm2, mm3
+ psubusb mm3, mm5
+
+ por mm0, mm1 ; |src - ref| for bytes 0..7
+ por mm2, mm3 ; |src - ref| for bytes 8..15
+
+ movq mm1, mm0
+ movq mm3, mm2
+
+ punpcklbw mm0, mm6 ; widen to words
+ punpckhbw mm1, mm6
+
+ punpcklbw mm2, mm6
+ punpckhbw mm3, mm6
+
+
+ paddw mm0, mm2
+ paddw mm1, mm3
+
+ paddw mm0, mm1
+ lea rsi, [rsi+rax] ; next source row
+
+ add rdi, rdx ; next reference row
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x16x8sad_mmx_loop
+
+ ; horizontal add of the four word accumulators
+ movq mm0, mm7
+ punpcklwd mm0, mm6
+
+ punpckhwd mm7, mm6
+ paddw mm0, mm7
+
+ movq mm7, mm0
+ psrlq mm0, 32
+
+ paddw mm7, mm0 ; low word of mm7 = total SAD
+ movq rax, mm7
+
+ pop rdi
+ pop rsi
+ mov rsp, rbp
+ ; begin epilog
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_sse2.asm b/libvpx/vp8/common/x86/sad_sse2.asm
new file mode 100644
index 0000000..8d86abc
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse2.asm
@@ -0,0 +1,410 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_sad16x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Sum of absolute differences over a 16-byte-wide, 16-row block using psadbw.
+; Two rows are processed per loop iteration; the result is returned in rax.
+global sym(vp8_sad16x16_wmt) PRIVATE
+sym(vp8_sad16x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ SAVE_XMM 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rax*8]
+
+ lea rcx, [rcx+rax*8] ; loop end: src_ptr + 16 rows
+ pxor xmm6, xmm6 ; xmm6 = running sums (one per 64-bit half)
+
+.x16x16sad_wmt_loop:
+
+ movq xmm0, QWORD PTR [rsi] ; row n: each 16-byte row is loaded as two 8-byte halves
+ movq xmm2, QWORD PTR [rsi+8]
+
+ movq xmm1, QWORD PTR [rdi]
+ movq xmm3, QWORD PTR [rdi+8]
+
+ movq xmm4, QWORD PTR [rsi+rax] ; row n+1, first halves
+ movq xmm5, QWORD PTR [rdi+rdx]
+
+
+ punpcklbw xmm0, xmm2 ; interleave the halves into one 16-byte register;
+ punpcklbw xmm1, xmm3 ; src and ref are permuted identically, so the SAD is unchanged
+
+ psadbw xmm0, xmm1
+ movq xmm2, QWORD PTR [rsi+rax+8] ; row n+1, second halves
+
+ movq xmm3, QWORD PTR [rdi+rdx+8]
+ lea rsi, [rsi+rax*2] ; advance both pointers by two rows
+
+ lea rdi, [rdi+rdx*2]
+ punpcklbw xmm4, xmm2
+
+ punpcklbw xmm5, xmm3
+ psadbw xmm4, xmm5
+
+ paddw xmm6, xmm0
+ paddw xmm6, xmm4
+
+ cmp rsi, rcx
+ jne .x16x16sad_wmt_loop
+
+ movq xmm0, xmm6 ; low-half sum
+ psrldq xmm6, 8 ; high-half sum moved down
+
+ paddw xmm0, xmm6 ; total SAD
+ movq rax, xmm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad8x16_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+;
+; Sum of absolute differences over an 8-byte-wide, 16-row block, two rows per
+; iteration. Before each iteration the running sum is compared (unsigned)
+; against max_sad; once it is greater, the partial sum is returned early.
+global sym(vp8_sad8x16_wmt) PRIVATE
+sym(vp8_sad8x16_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8]
+
+ lea rcx, [rcx+rbx*8] ; loop end: src_ptr + 16 rows
+ pxor mm7, mm7 ; mm7 = running SAD
+
+.x8x16sad_wmt_loop:
+
+ movq rax, mm7 ; rax = SAD so far (also the early-exit return value)
+ cmp eax, arg(4) ; compare with max_sad
+ ja .x8x16sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ movq mm2, QWORD PTR [rsi+rbx]
+ movq mm3, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1 ; row n
+ psadbw mm2, mm3 ; row n+1
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm7, mm0
+ paddw mm7, mm2
+
+ cmp rsi, rcx
+ jne .x8x16sad_wmt_loop
+
+ movq rax, mm7 ; full-block SAD
+
+.x8x16sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad8x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+;
+; Sum of absolute differences over an 8-byte-wide, 8-row block, one row per
+; iteration. The fifth argument is read as an early-exit threshold: before
+; each row the running sum is compared (unsigned) against it, and the partial
+; sum is returned as soon as it is greater.
+global sym(vp8_sad8x8_wmt) PRIVATE
+sym(vp8_sad8x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8] ; loop end: src_ptr + 8 rows
+ pxor mm7, mm7 ; mm7 = running SAD
+
+.x8x8sad_wmt_loop:
+
+ movq rax, mm7 ; rax = SAD so far (also the early-exit return value)
+ cmp eax, arg(4) ; compare with max_sad
+ ja .x8x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi]
+ movq mm1, QWORD PTR [rdi]
+
+ psadbw mm0, mm1
+ lea rsi, [rsi+rbx] ; next src row
+
+ add rdi, rdx ; next ref row
+ paddw mm7, mm0
+
+ cmp rsi, rcx
+ jne .x8x8sad_wmt_loop
+
+ movq rax, mm7 ; full-block SAD
+.x8x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;unsigned int vp8_sad4x4_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride)
+;
+; Sum of absolute differences over a 4x4 byte block. Rows are loaded four
+; bytes at a time and combined in pairs so that each psadbw covers two rows.
+; The result is returned in rax.
+global sym(vp8_sad4x4_wmt) PRIVATE
+sym(vp8_sad4x4_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ movd mm0, DWORD PTR [rsi] ; rows 0 and 1
+ movd mm1, DWORD PTR [rdi]
+
+ movd mm2, DWORD PTR [rsi+rax]
+ movd mm3, DWORD PTR [rdi+rdx]
+
+ punpcklbw mm0, mm2 ; interleave two 4-byte rows into 8 bytes
+ punpcklbw mm1, mm3 ; (same permutation for src and ref)
+
+ psadbw mm0, mm1 ; SAD of rows 0-1
+ lea rsi, [rsi+rax*2]
+
+ lea rdi, [rdi+rdx*2]
+ movd mm4, DWORD PTR [rsi] ; rows 2 and 3
+
+ movd mm5, DWORD PTR [rdi]
+ movd mm6, DWORD PTR [rsi+rax]
+
+ movd mm7, DWORD PTR [rdi+rdx]
+ punpcklbw mm4, mm6
+
+ punpcklbw mm5, mm7
+ psadbw mm4, mm5 ; SAD of rows 2-3
+
+ paddw mm0, mm4 ; total SAD
+ movq rax, mm0
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+;
+; Sum of absolute differences over a 16-byte-wide, 8-row block, two rows per
+; iteration, each row handled as two 8-byte halves. The fifth argument is
+; read as an early-exit threshold: before each iteration the running sum is
+; compared (unsigned) against it, and the partial sum is returned as soon as
+; it is greater.
+global sym(vp8_sad16x8_wmt) PRIVATE
+sym(vp8_sad16x8_wmt):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rbx, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ lea rcx, [rsi+rbx*8] ; loop end: src_ptr + 8 rows
+ pxor mm7, mm7 ; mm7 = running SAD
+
+.x16x8sad_wmt_loop:
+
+ movq rax, mm7 ; rax = SAD so far (also the early-exit return value)
+ cmp eax, arg(4) ; compare with max_sad
+ ja .x16x8sad_wmt_early_exit
+
+ movq mm0, QWORD PTR [rsi] ; row n, src halves
+ movq mm2, QWORD PTR [rsi+8]
+
+ movq mm1, QWORD PTR [rdi] ; row n, ref halves
+ movq mm3, QWORD PTR [rdi+8]
+
+ movq mm4, QWORD PTR [rsi+rbx] ; row n+1, first halves
+ movq mm5, QWORD PTR [rdi+rdx]
+
+ psadbw mm0, mm1
+ psadbw mm2, mm3
+
+ movq mm1, QWORD PTR [rsi+rbx+8] ; row n+1, second halves
+ movq mm3, QWORD PTR [rdi+rdx+8]
+
+ psadbw mm4, mm5
+ psadbw mm1, mm3
+
+ lea rsi, [rsi+rbx*2]
+ lea rdi, [rdi+rdx*2]
+
+ paddw mm0, mm2 ; row n total
+ paddw mm4, mm1 ; row n+1 total
+
+ paddw mm7, mm0
+ paddw mm7, mm4
+
+ cmp rsi, rcx
+ jne .x16x8sad_wmt_loop
+
+ movq rax, mm7 ; full-block SAD
+
+.x16x8sad_wmt_early_exit:
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+;
+; Copies `height` rows of 32 bytes from src to dst. Rows are copied four at a
+; time, then any remainder one at a time. Source rows are read with unaligned
+; loads (movdqu); destination rows are written with aligned stores (movdqa),
+; so every destination row address must be 16-byte aligned.
+; NOTE(review): the four-row loop body executes before height is first
+; tested, so a call with height < 4 would still copy four rows - confirm that
+; all callers pass height >= 4.
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi] ; rows 0 and 1
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi] ; rows 2 and 3
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4 ; four rows done
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4 ; repeat while at least four rows remain
+
+ cmp rcx, 0
+ je .copy_is_done ; no remainder rows
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop ; loop until the remaining-row count reaches 0
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/sad_sse3.asm b/libvpx/vp8/common/x86/sad_sse3.asm
new file mode 100644
index 0000000..f90a589
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse3.asm
@@ -0,0 +1,960 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; STACK_FRAME_CREATE_X3
+; Prologue for the five-argument routines in this file. It defines the
+; symbolic names src_ptr, src_stride, ref_ptr, ref_stride, end_ptr and
+; ret_var as registers appropriate to the target ABI. result_ptr, max_sad and
+; height are three names for the same thing - the fifth argument - so each
+; routine uses whichever name matches its prototype.
+; 32-bit: builds an rbp frame, saves rsi/rdi/rbx and loads the first four
+; arguments from the stack (strides are sign-extended).
+; x64 output format: arguments stay in rcx/rdx/r8/r9; the fifth is read from
+; the stack past the area reserved by SAVE_XMM.
+; other 64-bit: all five arguments stay in their incoming registers.
+; NOTE(review): unlike the 32-bit path, the 64-bit paths use the int
+; arguments (strides, and height on the non-x64 path) directly from
+; full-width registers with no movsxd - confirm the upper halves are
+; guaranteed valid by the callers.
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+; STACK_FRAME_DESTROY_X3
+; Epilogue matching STACK_FRAME_CREATE_X3: clears the symbolic register
+; names (redefines them as empty), undoes whatever the prologue pushed or
+; saved for the current ABI, and returns.
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+; STACK_FRAME_CREATE_X4
+; Prologue for the "x4d" routines, whose third argument is an array of four
+; reference pointers. It defines src_ptr, src_stride, r0_ptr..r3_ptr,
+; ref_stride and result_ptr for the target ABI and loads the four reference
+; pointers with LOAD_X4_ADDRESSES.
+; 32-bit: every general register is needed, so ref_stride is kept in rbp.
+; The frame pointer is pushed a second time first; arg()/result_ptr are
+; therefore only usable again after the routine pops rbp back (the routines
+; below do this under %if ABI_IS_32BIT before reading result_ptr).
+; x64 output format: rsi is pushed and used as r0_ptr; the result_ptr stack
+; offset uses 16 where the X3 macro uses 8 (presumably to account for that
+; extra push).
+%macro STACK_FRAME_CREATE_X4 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define r0_ptr rcx
+ %define r1_ptr rdx
+ %define r2_ptr rbx
+ %define r3_ptr rdi
+ %define ref_stride rbp
+ %define result_ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ push rbp
+ mov rdi, arg(2) ; ref_ptr_base
+
+ ; r2's pointer lands in rax for now; rdi (the base) is overwritten last
+ LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+ mov rsi, arg(0) ; src_ptr
+
+ movsxd rbx, dword ptr arg(1) ; src_stride
+ movsxd rbp, dword ptr arg(3) ; ref_stride
+
+ xchg rbx, rax ; now rbx = r2_ptr, rax = src_stride
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define r0_ptr rsi
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr r8
+ %define ref_stride r9
+ %define result_ptr [rsp+xmm_stack_space+16+4*8]
+ push rsi
+
+ LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define r0_ptr r9
+ %define r1_ptr r10
+ %define r2_ptr r11
+ %define r3_ptr rdx
+ %define ref_stride rcx
+ %define result_ptr r8
+
+ LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
+
+ %endif
+%endif
+%endmacro
+
+; STACK_FRAME_DESTROY_X4
+; Epilogue matching STACK_FRAME_CREATE_X4: clears the symbolic register
+; names, restores the registers saved by the prologue and returns. On 32-bit
+; builds this pops four registers, so the second rbp pushed by the prologue
+; must already have been popped by the routine itself.
+%macro STACK_FRAME_DESTROY_X4 0
+ %define src_ptr
+ %define src_stride
+ %define r0_ptr
+ %define r1_ptr
+ %define r2_ptr
+ %define r3_ptr
+ %define ref_stride
+ %define result_ptr
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
+; Processes two 16-byte rows and accumulates three SADs: source versus the
+; reference at byte offsets +0, +1 and +2 (xmm5, xmm6, xmm7 respectively).
+; %1 (stage): 0 - first call: the accumulators are initialised, not added to
+; non-zero - results are added to the existing accumulators
+; 0 or 1 - src/ref are advanced by two rows afterwards;
+; any other value (2 is used for the last call) leaves them
+; %2/%3: src and ref pointer registers, %4/%5: their strides.
+; Source rows are read with movdqa (16-byte aligned), reference rows with
+; lddqu (unaligned). Clobbers xmm0-xmm3.
+%macro PROCESS_16X2X3 5
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm5, XMMWORD PTR [%3]
+ lddqu xmm6, XMMWORD PTR [%3+1]
+ lddqu xmm7, XMMWORD PTR [%3+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%3+1]
+ lddqu xmm3, XMMWORD PTR [%3+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ ; second row of the pair
+ movdqa xmm0, XMMWORD PTR [%2+%4]
+ lddqu xmm1, XMMWORD PTR [%3+%5]
+ lddqu xmm2, XMMWORD PTR [%3+%5+1]
+ lddqu xmm3, XMMWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
+; 8-byte-wide counterpart of PROCESS_16X2X3 using MMX registers: processes
+; two 8-byte rows and accumulates the SADs against the reference at byte
+; offsets +0, +1 and +2 in mm5, mm6 and mm7.
+; %1 (stage): 0 - first call: the accumulators are initialised, not added to
+; non-zero - results are added to the existing accumulators
+; 0 or 1 - src/ref are advanced by two rows afterwards;
+; any other value (2 is used for the last call) leaves them
+; Clobbers mm0-mm3.
+%macro PROCESS_8X2X3 5
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm5, QWORD PTR [%3]
+ movq mm6, QWORD PTR [%3+1]
+ movq mm7, QWORD PTR [%3+2]
+
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%3+1]
+ movq mm3, QWORD PTR [%3+2]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endif
+ ; second row of the pair
+ movq mm0, QWORD PTR [%2+%4]
+ movq mm1, QWORD PTR [%3+%5]
+ movq mm2, QWORD PTR [%3+%5+1]
+ movq mm3, QWORD PTR [%3+%5+2]
+
+%if %1==0 || %1==1
+ lea %2, [%2+%4*2]
+ lea %3, [%3+%5*2]
+%endif
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+%endmacro
+
+; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
+; Loads four consecutive register-sized values (element size REG_SZ_BYTES)
+; from the array at %1 into %2..%5. The loads happen in order, so %5 may be
+; the same register as %1: the base is only overwritten by the final load.
+%macro LOAD_X4_ADDRESSES 5
+ mov %2, [%1+REG_SZ_BYTES*0]
+ mov %3, [%1+REG_SZ_BYTES*1]
+
+ mov %4, [%1+REG_SZ_BYTES*2]
+ mov %5, [%1+REG_SZ_BYTES*3]
+%endmacro
+
+; PROCESS_16X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
+; Processes two 16-byte rows of the source against four independent
+; reference blocks, accumulating one SAD per reference in xmm4, xmm5, xmm6
+; and xmm7 (for %3, %4, %5 and %6 respectively).
+; %1 (stage): 0 - first call: the accumulators are initialised, not added to
+; non-zero - results are added to the existing accumulators
+; 0 or 1 - all five pointers are advanced by two rows;
+; any other value (2 is used for the last call) leaves them
+; All four references share the single stride %8. Source rows are read with
+; movdqa (16-byte aligned), reference rows with lddqu (unaligned).
+; Clobbers xmm0-xmm3.
+%macro PROCESS_16X2X4 8
+%if %1==0
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm4, XMMWORD PTR [%3]
+ lddqu xmm5, XMMWORD PTR [%4]
+ lddqu xmm6, XMMWORD PTR [%5]
+ lddqu xmm7, XMMWORD PTR [%6]
+
+ psadbw xmm4, xmm0
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ movdqa xmm0, XMMWORD PTR [%2]
+ lddqu xmm1, XMMWORD PTR [%3]
+ lddqu xmm2, XMMWORD PTR [%4]
+ lddqu xmm3, XMMWORD PTR [%5]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6] ; xmm1 is reused for the fourth reference
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+%endif
+ ; second row of the pair
+ movdqa xmm0, XMMWORD PTR [%2+%7]
+ lddqu xmm1, XMMWORD PTR [%3+%8]
+ lddqu xmm2, XMMWORD PTR [%4+%8]
+ lddqu xmm3, XMMWORD PTR [%5+%8]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm4, xmm1
+ lddqu xmm1, XMMWORD PTR [%6+%8] ; fourth reference, loaded before the pointers move
+ paddw xmm5, xmm2
+ paddw xmm6, xmm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw xmm1, xmm0
+ paddw xmm7, xmm1
+
+%endmacro
+
+; PROCESS_8X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
+; 8-byte-wide counterpart of PROCESS_16X2X4 using MMX registers: processes
+; two 8-byte source rows against four reference blocks, accumulating one SAD
+; per reference in mm4, mm5, mm6 and mm7 (for %3, %4, %5 and %6).
+; %1 (stage): 0 - first call: the accumulators are initialised, not added to
+; non-zero - results are added to the existing accumulators
+; 0 or 1 - all five pointers are advanced by two rows;
+; any other value (2 is used for the last call) leaves them
+; Clobbers mm0-mm3.
+%macro PROCESS_8X2X4 8
+%if %1==0
+ movq mm0, QWORD PTR [%2]
+ movq mm4, QWORD PTR [%3]
+ movq mm5, QWORD PTR [%4]
+ movq mm6, QWORD PTR [%5]
+ movq mm7, QWORD PTR [%6]
+
+ psadbw mm4, mm0
+ psadbw mm5, mm0
+ psadbw mm6, mm0
+ psadbw mm7, mm0
+%else
+ movq mm0, QWORD PTR [%2]
+ movq mm1, QWORD PTR [%3]
+ movq mm2, QWORD PTR [%4]
+ movq mm3, QWORD PTR [%5]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6] ; mm1 is reused for the fourth reference
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+ psadbw mm1, mm0
+ paddw mm7, mm1
+%endif
+ ; second row of the pair
+ movq mm0, QWORD PTR [%2+%7]
+ movq mm1, QWORD PTR [%3+%8]
+ movq mm2, QWORD PTR [%4+%8]
+ movq mm3, QWORD PTR [%5+%8]
+
+ psadbw mm1, mm0
+ psadbw mm2, mm0
+ psadbw mm3, mm0
+
+ paddw mm4, mm1
+ movq mm1, QWORD PTR [%6+%8] ; fourth reference, loaded before the pointers move
+ paddw mm5, mm2
+ paddw mm6, mm3
+
+%if %1==0 || %1==1
+ lea %2, [%2+%7*2]
+ lea %3, [%3+%8*2]
+
+ lea %4, [%4+%8*2]
+ lea %5, [%5+%8*2]
+
+ lea %6, [%6+%8*2]
+%endif
+ psadbw mm1, mm0
+ paddw mm7, mm1
+
+%endmacro
+
+;void vp8_sad16x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes three 16x16 SADs of the source block against the reference at
+; byte offsets +0, +1 and +2 and stores them as 32-bit values in results[0],
+; results[1] and results[2].
+global sym(vp8_sad16x16x3_sse3) PRIVATE
+sym(vp8_sad16x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ ; eight two-row steps = 16 rows; the last step does not advance the pointers
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ ; each accumulator holds one partial sum per 64-bit half; add the halves
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0 ; results[0]: offset +0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0 ; results[1]: offset +1
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0 ; results[2]: offset +2
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad16x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes three 16x8 SADs of the source block against the reference at
+; byte offsets +0, +1 and +2 and stores them as 32-bit values in results[0],
+; results[1] and results[2].
+global sym(vp8_sad16x8x3_sse3) PRIVATE
+sym(vp8_sad16x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ ; four two-row steps = 8 rows; the last step does not advance the pointers
+ PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ ; each accumulator holds one partial sum per 64-bit half; add the halves
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx], xmm0 ; results[0]: offset +0
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+4], xmm0 ; results[1]: offset +1
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+8], xmm0 ; results[2]: offset +2
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad8x16x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes three 8x16 SADs of the source block against the reference at
+; byte offsets +0, +1 and +2 and stores them as 32-bit values in results[0],
+; results[1] and results[2].
+global sym(vp8_sad8x16x3_sse3) PRIVATE
+sym(vp8_sad8x16x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ ; eight two-row steps = 16 rows; the last step does not advance the pointers
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6 ; pack the +0 and +1 SADs into one qword
+
+ movq [rcx], mm5 ; results[0], results[1]
+ movd [rcx+8], mm7 ; results[2]
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad8x8x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes three 8x8 SADs of the source block against the reference at
+; byte offsets +0, +1 and +2 and stores them as 32-bit values in results[0],
+; results[1] and results[2].
+global sym(vp8_sad8x8x3_sse3) PRIVATE
+sym(vp8_sad8x8x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ ; four two-row steps = 8 rows; the last step does not advance the pointers
+ PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
+ PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
+
+ mov rcx, result_ptr
+
+ punpckldq mm5, mm6 ; pack the +0 and +1 SADs into one qword
+
+ movq [rcx], mm5 ; results[0], results[1]
+ movd [rcx+8], mm7 ; results[2]
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad4x4x3_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes three 4x4 SADs of the source block against the reference at byte
+; offsets +0, +1 and +2 and stores them as 32-bit values in results[0..2].
+; Rows are handled in pairs: two 4-byte rows are interleaved into one 8-byte
+; register (source and reference alike) so that each psadbw covers two rows.
+global sym(vp8_sad4x4x3_sse3) PRIVATE
+sym(vp8_sad4x4x3_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ movd mm0, DWORD PTR [src_ptr] ; rows 0-1
+ movd mm1, DWORD PTR [ref_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm2 ; mm0 = src rows 0-1
+ punpcklbw mm1, mm3 ; mm1 = ref rows 0-1, offset +0
+
+ movd mm4, DWORD PTR [ref_ptr+1]
+ movd mm5, DWORD PTR [ref_ptr+2]
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
+
+ psadbw mm1, mm0 ; mm1 = SAD rows 0-1, offset +0
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ psadbw mm4, mm0 ; mm4 = SAD rows 0-1, offset +1
+ psadbw mm5, mm0 ; mm5 = SAD rows 0-1, offset +2
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr] ; rows 2-3
+ movd mm2, DWORD PTR [ref_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride]
+
+ punpcklbw mm0, mm3 ; mm0 = src rows 2-3
+ punpcklbw mm2, mm6
+
+ movd mm3, DWORD PTR [ref_ptr+1]
+ movd mm7, DWORD PTR [ref_ptr+2]
+
+ psadbw mm2, mm0
+
+ paddw mm1, mm2 ; mm1 = total SAD, offset +0
+
+ movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
+ movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm6
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ paddw mm3, mm4 ; mm3 = total SAD, offset +1
+ paddw mm7, mm5 ; mm7 = total SAD, offset +2
+
+ mov rcx, result_ptr
+
+ punpckldq mm1, mm3 ; pack the +0 and +1 SADs into one qword
+
+ movq [rcx], mm1 ; results[0], results[1]
+ movd [rcx+8], mm7 ; results[2]
+
+ STACK_FRAME_DESTROY_X3
+
+;unsigned int vp8_sad16x16_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int max_sad)
+;
+; Sum of absolute differences over a 16x16 block, four rows per loop
+; iteration, returned in rax. max_sad is not read anywhere in this routine,
+; so there is no early exit. Source rows are read with movdqa (16-byte
+; aligned), reference rows with movdqu (unaligned).
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3) PRIVATE
+sym(vp8_sad16x16_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+ mov end_ptr, 4 ; end_ptr is used here as an iteration counter (4 x 4 rows)
+ pxor xmm7, xmm7 ; xmm7 = running sums (one per 64-bit half)
+
+.vp8_sad16x16_sse3_loop:
+ movdqa xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [ref_ptr]
+ movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
+ movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa xmm4, XMMWORD PTR [src_ptr]
+ movdqu xmm5, XMMWORD PTR [ref_ptr]
+ movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
+
+ psadbw xmm0, xmm1
+
+ movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride] ; xmm1 is free again after the psadbw above
+
+ psadbw xmm2, xmm3
+ psadbw xmm4, xmm5
+ psadbw xmm6, xmm1
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea ref_ptr, [ref_ptr+ref_stride*2]
+
+ paddw xmm7, xmm0
+ paddw xmm7, xmm2
+ paddw xmm7, xmm4
+ paddw xmm7, xmm6
+
+ sub end_ptr, 1
+ jne .vp8_sad16x16_sse3_loop
+
+ movq xmm0, xmm7 ; low-half sum
+ psrldq xmm7, 8 ; high-half sum moved down
+ paddw xmm0, xmm7 ; total SAD
+ movq rax, xmm0
+
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+;
+; Copies `height` rows of 32 bytes from src to dst, four rows at a time and
+; then any remainder one row at a time. The routine reuses the X3 stack-frame
+; macro, so the destination pointer and stride appear below under the names
+; ref_ptr and ref_stride, and end_ptr serves as a scratch "two rows further"
+; pointer. Source rows are read with movdqu; destination rows are written
+; with movdqa, so every destination row address must be 16-byte aligned.
+; NOTE(review): the four-row loop body executes before height is first
+; tested, so a call with height < 4 would still copy four rows - confirm that
+; all callers pass height >= 4.
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2] ; end_ptr = source rows 2-3
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2] ; end_ptr = destination rows 2-3
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+ ;Check whether any rows remain to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
+
+;void vp8_sad16x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+;
+; Computes the 16x16 SAD of the source block against each of the four
+; reference blocks whose addresses are stored in the array at ref_ptr_base,
+; and stores the four 32-bit results in results[0..3].
+global sym(vp8_sad16x16x4d_sse3) PRIVATE
+sym(vp8_sad16x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ ; eight two-row steps = 16 rows; the last step does not advance the pointers
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+; 32-bit only: rbp held ref_stride; restore the frame pointer so that
+; result_ptr (an arg() reference) can be read
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ ; each accumulator holds one partial sum per 64-bit half; add the halves
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0 ; results[0]
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0 ; results[1]
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0 ; results[2]
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0 ; results[3]
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad16x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr_base,
+; int ref_stride,
+; int *results)
+;
+; Computes the 16x8 SAD of the source block against each of the four
+; reference blocks whose addresses are stored in the array at ref_ptr_base,
+; and stores the four 32-bit results in results[0..3].
+global sym(vp8_sad16x8x4d_sse3) PRIVATE
+sym(vp8_sad16x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ ; four two-row steps = 8 rows; the last step does not advance the pointers
+ PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+; 32-bit only: rbp held ref_stride; restore the frame pointer so that
+; result_ptr (an arg() reference) can be read
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ ; each accumulator holds one partial sum per 64-bit half; add the halves
+ movq xmm0, xmm4
+ psrldq xmm4, 8
+
+ paddw xmm0, xmm4
+ movd [rcx], xmm0 ; results[0]
+;-
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rcx+4], xmm0 ; results[1]
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rcx+8], xmm0 ; results[2]
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rcx+12], xmm0 ; results[3]
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad8x16x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes the 8x16 SAD of the source block against four reference blocks
+; and stores the four 32-bit results in results[0..3]. The third argument is
+; read as an array of four reference pointers (see STACK_FRAME_CREATE_X4).
+global sym(vp8_sad8x16x4d_sse3) PRIVATE
+sym(vp8_sad8x16x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ ; eight two-row steps = 16 rows; the last step does not advance the pointers
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+; 32-bit only: rbp held ref_stride; restore the frame pointer so that
+; result_ptr (an arg() reference) can be read
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5 ; pack SADs 0 and 1 into one qword
+ punpckldq mm6, mm7 ; pack SADs 2 and 3 into one qword
+
+ movq [rcx], mm4 ; results[0], results[1]
+ movq [rcx+8], mm6 ; results[2], results[3]
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad8x8x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes the 8x8 SAD of the source block against four reference blocks
+; and stores the four 32-bit results in results[0..3]. The third argument is
+; read as an array of four reference pointers (see STACK_FRAME_CREATE_X4).
+global sym(vp8_sad8x8x4d_sse3) PRIVATE
+sym(vp8_sad8x8x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ ; four two-row steps = 8 rows; the last step does not advance the pointers
+ PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+ PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
+
+; 32-bit only: rbp held ref_stride; restore the frame pointer so that
+; result_ptr (an arg() reference) can be read
+%if ABI_IS_32BIT
+ pop rbp
+%endif
+ mov rcx, result_ptr
+
+ punpckldq mm4, mm5 ; pack SADs 0 and 1 into one qword
+ punpckldq mm6, mm7 ; pack SADs 2 and 3 into one qword
+
+ movq [rcx], mm4 ; results[0], results[1]
+ movq [rcx+8], mm6 ; results[2], results[3]
+
+ STACK_FRAME_DESTROY_X4
+
+;void vp8_sad4x4x4d_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+;
+; Computes the 4x4 SAD of the source block against four reference blocks
+; and stores the four 32-bit results in results[0..3]. The third argument is
+; read as an array of four reference pointers (see STACK_FRAME_CREATE_X4).
+; Rows are handled in pairs: two 4-byte rows are interleaved into one 8-byte
+; register (source and references alike) so that each psadbw covers two rows.
+; Statement order matters here: rsi is overwritten with the results pointer
+; part-way through, after the last use of whichever symbolic name maps to
+; rsi on the current ABI.
+global sym(vp8_sad4x4x4d_sse3) PRIVATE
+sym(vp8_sad4x4x4d_sse3):
+
+ STACK_FRAME_CREATE_X4
+
+ movd mm0, DWORD PTR [src_ptr] ; rows 0-1
+ movd mm1, DWORD PTR [r0_ptr]
+
+ movd mm2, DWORD PTR [src_ptr+src_stride]
+ movd mm3, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm2 ; mm0 = src rows 0-1
+ punpcklbw mm1, mm3 ; mm1 = ref0 rows 0-1
+
+ movd mm4, DWORD PTR [r1_ptr]
+ movd mm5, DWORD PTR [r2_ptr]
+
+ movd mm6, DWORD PTR [r3_ptr]
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+
+ movd mm3, DWORD PTR [r2_ptr+ref_stride]
+ movd mm7, DWORD PTR [r3_ptr+ref_stride]
+
+ psadbw mm1, mm0 ; mm1 = ref0 SAD, rows 0-1
+
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm3
+
+ punpcklbw mm6, mm7
+ psadbw mm4, mm0 ; mm4 = ref1 SAD, rows 0-1
+
+ psadbw mm5, mm0 ; mm5 = ref2 SAD, rows 0-1
+ psadbw mm6, mm0 ; mm6 = ref3 SAD, rows 0-1
+
+
+
+ lea src_ptr, [src_ptr+src_stride*2]
+ lea r0_ptr, [r0_ptr+ref_stride*2]
+
+ lea r1_ptr, [r1_ptr+ref_stride*2]
+ lea r2_ptr, [r2_ptr+ref_stride*2]
+
+ lea r3_ptr, [r3_ptr+ref_stride*2]
+
+ movd mm0, DWORD PTR [src_ptr] ; rows 2-3
+ movd mm2, DWORD PTR [r0_ptr]
+
+ movd mm3, DWORD PTR [src_ptr+src_stride]
+ movd mm7, DWORD PTR [r0_ptr+ref_stride]
+
+ punpcklbw mm0, mm3 ; mm0 = src rows 2-3
+ punpcklbw mm2, mm7
+
+ movd mm3, DWORD PTR [r1_ptr]
+ movd mm7, DWORD PTR [r2_ptr]
+
+ psadbw mm2, mm0
+; 32-bit only: move ref_stride out of rbp into rax (src_stride is no longer
+; needed), restore the frame pointer so result_ptr can be read, and rename
+; ref_stride accordingly for the rest of the routine
+%if ABI_IS_32BIT
+ mov rax, rbp
+
+ pop rbp
+%define ref_stride rax
+%endif
+ mov rsi, result_ptr
+
+ paddw mm1, mm2
+ movd [rsi], mm1 ; results[0]
+
+ movd mm2, DWORD PTR [r1_ptr+ref_stride]
+ movd mm1, DWORD PTR [r2_ptr+ref_stride]
+
+ punpcklbw mm3, mm2
+ punpcklbw mm7, mm1
+
+ psadbw mm3, mm0
+ psadbw mm7, mm0
+
+ movd mm2, DWORD PTR [r3_ptr]
+ movd mm1, DWORD PTR [r3_ptr+ref_stride]
+
+ paddw mm3, mm4
+ paddw mm7, mm5
+
+ movd [rsi+4], mm3 ; results[1]
+ punpcklbw mm2, mm1
+
+ movd [rsi+8], mm7 ; results[2]
+ psadbw mm2, mm0
+
+ paddw mm2, mm6
+ movd [rsi+12], mm2 ; results[3]
+
+
+ STACK_FRAME_DESTROY_X4
+
diff --git a/libvpx/vp8/common/x86/sad_sse4.asm b/libvpx/vp8/common/x86/sad_sse4.asm
new file mode 100644
index 0000000..f7fccd7
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_sse4.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; PROCESS_16X2X8 first
+; Processes two 16-byte source rows (rsi, stride rax) against the reference
+; (rdi, stride rdx) and accumulates eight SADs in the eight words of xmm1:
+; word k is the SAD against the reference shifted by k bytes (k = 0..7).
+; %1: non-zero - first call: xmm1 is initialised from the first row
+; zero - the first row's result is added to the existing xmm1
+; Each mpsadbw compares one 4-byte group of the source with eight sliding
+; 4-byte windows of the reference; immediate 0x0 uses source bytes 0-3 with
+; windows starting at reference byte 0, immediate 0x5 uses source bytes 4-7
+; with windows starting at reference byte 4. The upper eight source bytes are
+; handled the same way after shifting them down (psrldq) and using the
+; reference register that starts at byte 8.
+; 24 reference bytes are read per row. rsi/rdi are always advanced by two
+; rows. Clobbers xmm0 and xmm2-xmm5.
+%macro PROCESS_16X2X8 1
+%if %1
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm1, xmm3 ; xmm1 = ref bytes 0-15
+ punpcklqdq xmm3, xmm2 ; xmm3 = ref bytes 8-23
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8 ; source bytes 8-15 -> low half
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+%else
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ movq xmm2, MMWORD PTR [rdi+16]
+ punpcklqdq xmm5, xmm3 ; xmm5 = ref bytes 0-15
+ punpcklqdq xmm3, xmm2 ; xmm3 = ref bytes 8-23
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8 ; source bytes 8-15 -> low half
+
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endif
+ ; second row of the pair
+ movdqa xmm0, XMMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ movq xmm2, MMWORD PTR [rdi+ rdx+16]
+ punpcklqdq xmm5, xmm3
+ punpcklqdq xmm3, xmm2
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+
+ psrldq xmm0, 8
+ movdqa xmm4, xmm3
+ mpsadbw xmm3, xmm0, 0x0
+ mpsadbw xmm4, xmm0, 0x5
+
+ paddw xmm5, xmm2
+ paddw xmm5, xmm3
+ paddw xmm5, xmm4
+
+ paddw xmm1, xmm5
+%endmacro
+
+; PROCESS_8X2X8 first
+; 8-byte-wide counterpart of PROCESS_16X2X8: processes two 8-byte source
+; rows (rsi, stride rax) against the reference (rdi, stride rdx) and
+; accumulates eight SADs, one per reference byte offset 0..7, in the eight
+; words of xmm1.
+; %1: non-zero - first call: xmm1 is initialised from the first row
+; zero - the first row's result is added to the existing xmm1
+; mpsadbw immediate 0x0 covers source bytes 0-3, immediate 0x5 covers source
+; bytes 4-7 (with the reference windows starting four bytes further on).
+; 16 reference bytes are read per row. rsi/rdi are always advanced by two
+; rows. Clobbers xmm0, xmm2, xmm3 and xmm5.
+%macro PROCESS_8X2X8 1
+%if %1
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3 ; xmm1 = ref bytes 0-15
+
+ movdqa xmm2, xmm1
+ mpsadbw xmm1, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm1, xmm2
+%else
+ movq xmm0, MMWORD PTR [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3 ; xmm5 = ref bytes 0-15
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endif
+ ; second row of the pair
+ movq xmm0, MMWORD PTR [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ movdqa xmm2, xmm5
+ mpsadbw xmm5, xmm0, 0x0
+ mpsadbw xmm2, xmm0, 0x5
+ paddw xmm5, xmm2
+
+ paddw xmm1, xmm5
+%endmacro
+
+; PROCESS_4X2X8 first
+; 4-byte-wide counterpart of PROCESS_16X2X8: processes two 4-byte source
+; rows (rsi, stride rax) against the reference (rdi, stride rdx) and
+; accumulates eight SADs, one per reference byte offset 0..7, in the eight
+; words of xmm1. A single mpsadbw with immediate 0x0 per row is enough
+; because the source row is exactly one 4-byte group.
+; %1: non-zero - first call: xmm1 is initialised from the first row
+; zero - the first row's result is added to the existing xmm1
+; 16 reference bytes are read per row. rsi/rdi are always advanced by two
+; rows. Clobbers xmm0, xmm3 and xmm5.
+%macro PROCESS_4X2X8 1
+%if %1
+ movd xmm0, [rsi]
+ movq xmm1, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm1, xmm3 ; xmm1 = ref bytes 0-15
+
+ mpsadbw xmm1, xmm0, 0x0
+%else
+ movd xmm0, [rsi]
+ movq xmm5, MMWORD PTR [rdi]
+ movq xmm3, MMWORD PTR [rdi+8]
+ punpcklqdq xmm5, xmm3 ; xmm5 = ref bytes 0-15
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endif
+ ; second row of the pair
+ movd xmm0, [rsi + rax]
+ movq xmm5, MMWORD PTR [rdi+ rdx]
+ movq xmm3, MMWORD PTR [rdi+ rdx+8]
+ punpcklqdq xmm5, xmm3
+
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ mpsadbw xmm5, xmm0, 0x0
+
+ paddw xmm1, xmm5
+%endmacro
+
+
+;void vp8_sad16x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array);
+;
+; Computes eight 16x16 SADs of the source block against the reference at
+; byte offsets 0..7 and writes them as eight 16-bit values to sad_array.
+; The result is stored with movdqa, so sad_array must be 16-byte aligned;
+; source rows are also read with movdqa.
+global sym(vp8_sad16x16x8_sse4) PRIVATE
+sym(vp8_sad16x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ ; eight two-row steps = 16 rows; the first one initialises xmm1
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad16x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+;
+; Computes eight 16x8 SADs of the source block against the reference at
+; byte offsets 0..7 and writes them as eight 16-bit values to sad_array.
+; The result is stored with movdqa, so sad_array must be 16-byte aligned;
+; source rows are also read with movdqa.
+global sym(vp8_sad16x8x8_sse4) PRIVATE
+sym(vp8_sad16x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ ; four two-row steps = 8 rows; the first one initialises xmm1
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x8x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+;
+; Computes eight 8x8 SADs of the source block against the reference at
+; byte offsets 0..7 and writes them as eight 16-bit values to sad_array.
+; The result is stored with movdqa, so sad_array must be 16-byte aligned.
+global sym(vp8_sad8x8x8_sse4) PRIVATE
+sym(vp8_sad8x8x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ ; four two-row steps = 8 rows; the first one initialises xmm1
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad8x16x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+;
+; Computes eight 8x16 SADs of the source block against the reference at
+; byte offsets 0..7 and writes them as eight 16-bit values to sad_array.
+; The result is stored with movdqa, so sad_array must be 16-byte aligned.
+global sym(vp8_sad8x16x8_sse4) PRIVATE
+sym(vp8_sad8x16x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ ; eight two-row steps = 16 rows; the first one initialises xmm1
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_sad4x4x8_sse4(
+; const unsigned char *src_ptr,
+; int src_stride,
+; const unsigned char *ref_ptr,
+; int ref_stride,
+; unsigned short *sad_array
+;);
+;
+; Computes eight 4x4 SADs of the source block against the reference at
+; byte offsets 0..7 and writes them as eight 16-bit values to sad_array.
+; The result is stored with movdqa, so sad_array must be 16-byte aligned.
+global sym(vp8_sad4x4x8_sse4) PRIVATE
+sym(vp8_sad4x4x8_sse4):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ ; two two-row steps = 4 rows; the first one initialises xmm1
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
+
+ mov rdi, arg(4) ;Results
+ movdqa XMMWORD PTR [rdi], xmm1
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
diff --git a/libvpx/vp8/common/x86/sad_ssse3.asm b/libvpx/vp8/common/x86/sad_ssse3.asm
new file mode 100644
index 0000000..278fc06
--- /dev/null
+++ b/libvpx/vp8/common/x86/sad_ssse3.asm
@@ -0,0 +1,370 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; PROCESS_16X2X3 first
+; Accumulates SADs for two 16-byte source rows at [rsi] against the reference
+; at [rdi], [rdi+1] and [rdi+2] into xmm5, xmm6 and xmm7 respectively.
+; %1 != 0: first call - the accumulators are initialised from row 0.
+; %1 == 0: row 0 is added to the existing accumulators.
+; rax / rdx are used as the source / reference row strides, and rsi / rdi are
+; advanced by two rows. Source rows are read with movdqa (16-byte aligned),
+; reference rows with lddqu (unaligned). Clobbers xmm0-xmm3.
+%macro PROCESS_16X2X3 1
+%if %1
+ ; first pass: psadbw results go straight into the accumulators
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm5, XMMWORD PTR [rdi]
+ lddqu xmm6, XMMWORD PTR [rdi+1]
+ lddqu xmm7, XMMWORD PTR [rdi+2]
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ ; later passes: compute into temporaries, then accumulate
+ movdqa xmm0, XMMWORD PTR [rsi]
+ lddqu xmm1, XMMWORD PTR [rdi]
+ lddqu xmm2, XMMWORD PTR [rdi+1]
+ lddqu xmm3, XMMWORD PTR [rdi+2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ ; second row of the pair (one stride further on)
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ lddqu xmm1, XMMWORD PTR [rdi+rdx]
+ lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
+ lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
+
+ ; step both pointers past the two rows just loaded
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+; PROCESS_16X2X3_OFFSET first, offset
+; Accumulates SADs for two 16-byte source rows into xmm5, xmm6 and xmm7, like
+; PROCESS_16X2X3, but reads the reference with aligned loads only: two
+; 16-byte blocks are loaded from [rdi] and [rdi+16] and palignr extracts the
+; 16 bytes that start at byte %2, %2+1 and %2+2 of that 32-byte window.
+; %1 != 0: first call - the accumulators are initialised from row 0.
+; %1 == 0: row 0 is added to the existing accumulators.
+; %2 : byte offset used as the palignr shift; rdi itself must be
+; 16-byte aligned because it is read with movdqa.
+; rax / rdx are used as the source / reference row strides, and rsi / rdi are
+; advanced by two rows. Clobbers xmm0-xmm4.
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+ ; first pass: build the three shifted rows directly in the accumulators
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm7, XMMWORD PTR [rdi+16]
+
+ movdqa xmm5, xmm7
+ palignr xmm5, xmm4, %2
+
+ movdqa xmm6, xmm7
+ palignr xmm6, xmm4, (%2+1)
+
+ palignr xmm7, xmm4, (%2+2)
+
+ psadbw xmm5, xmm0
+ psadbw xmm6, xmm0
+ psadbw xmm7, xmm0
+%else
+ ; later passes: build the shifted rows in temporaries, then accumulate
+ movdqa xmm0, XMMWORD PTR [rsi]
+ movdqa xmm4, XMMWORD PTR [rdi]
+ movdqa xmm3, XMMWORD PTR [rdi+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endif
+ ; second row of the pair (one stride further on)
+ movdqa xmm0, XMMWORD PTR [rsi+rax]
+ movdqa xmm4, XMMWORD PTR [rdi+rdx]
+ movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
+
+ movdqa xmm1, xmm3
+ palignr xmm1, xmm4, %2
+
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm4, (%2+1)
+
+ palignr xmm3, xmm4, (%2+2)
+
+ ; step both pointers past the two rows just loaded
+ lea rsi, [rsi+rax*2]
+ lea rdi, [rdi+rdx*2]
+
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+
+ paddw xmm5, xmm1
+ paddw xmm6, xmm2
+ paddw xmm7, xmm3
+%endmacro
+
+; PROCESS_16X16X3_OFFSET offset, label_prefix
+; Emits the entry point <label_prefix>_aligned_by_<offset>. It moves rdi back
+; by <offset> bytes (the dispatcher in the calling function selects this entry
+; when ref_ptr & 0xf == offset, so rdi becomes 16-byte aligned), runs eight
+; two-row passes (16 rows) using <offset> as the palignr shift, and then
+; jumps to <label_prefix>_store_off.
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+; PROCESS_16X8X3_OFFSET offset, label_prefix
+; Emits the entry point <label_prefix>_aligned_by_<offset>. It moves rdi back
+; by <offset> bytes (the dispatcher in the calling function selects this entry
+; when ref_ptr & 0xf == offset, so rdi becomes 16-byte aligned), runs four
+; two-row passes (8 rows) using <offset> as the palignr shift, and then
+; jumps to <label_prefix>_store_off.
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+ sub rdi, %1
+
+ PROCESS_16X2X3_OFFSET 1, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+ PROCESS_16X2X3_OFFSET 0, %1
+
+ jmp %2_store_off
+
+%endmacro
+
+; Computes three SADs of a 16-byte-wide, 16-row source block against the
+; reference at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores them as three
+; 32-bit values in results[0..2]. The code dispatches on (ref_ptr & 0xf)
+; through a jump table so that, for offsets 0..14, reference rows are read
+; with aligned loads plus palignr; offset 15 uses unaligned loads.
+; Source rows are read with movdqa, so they must be 16-byte aligned.
+;void vp8_sad16x16x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x16x3_ssse3) PRIVATE
+sym(vp8_sad16x16x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ ; rdx = ref_ptr & 0xf : index into the jump table below
+ mov rdx, 0xf
+ and rdx, rdi
+
+ ; The table of 32-bit offsets (relative to do_jump) sits in the instruction
+ ; stream, so execution has to jump over it.
+ jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+ dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+ dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
+
+ ; call pushes the address of the next instruction (do_jump); popping it
+ ; gives a run-time base address to which the table offsets are added.
+ call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax ; rcx = address of the selected aligned_by_N entry
+
+ ; rax / rdx are free again now that the table lookup is done; load the
+ ; strides the PROCESS_* macros expect in them.
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
+
+ ; Offset 15 uses unaligned loads: the palignr variant would need a shift
+ ; of 15+2 = 17 bytes, presumably why it is not used here. This path
+ ; falls through into store_off.
+.vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp8_sad16x16x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ ; psadbw leaves one partial sum in each 64-bit half of the accumulator:
+ ; add the high half to the low half and store the low dword.
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0 ; results[0]: SAD against ref_ptr
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0 ; results[1]: SAD against ref_ptr+1
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0 ; results[2]: SAD against ref_ptr+2
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; Computes three SADs of a 16-byte-wide, 8-row source block against the
+; reference at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores them as three
+; 32-bit values in results[0..2]. The code dispatches on (ref_ptr & 0xf)
+; through a jump table so that, for offsets 0..14, reference rows are read
+; with aligned loads plus palignr; offset 15 uses unaligned loads.
+; Source rows are read with movdqa, so they must be 16-byte aligned.
+;void vp8_sad16x8x3_ssse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *ref_ptr,
+; int ref_stride,
+; int *results)
+global sym(vp8_sad16x8x3_ssse3) PRIVATE
+sym(vp8_sad16x8x3_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rcx
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ ; rdx = ref_ptr & 0xf : index into the jump table below
+ mov rdx, 0xf
+ and rdx, rdi
+
+ ; The table of 32-bit offsets (relative to do_jump) sits in the instruction
+ ; stream, so execution has to jump over it.
+ jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+ dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+ dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
+
+ ; call pushes the address of the next instruction (do_jump); popping it
+ ; gives a run-time base address to which the table offsets are added.
+ call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
+ pop rcx ; get the address of do_jump
+ mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+ movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
+ add rcx, rax ; rcx = address of the selected aligned_by_N entry
+
+ ; rax / rdx are free again now that the table lookup is done; load the
+ ; strides the PROCESS_* macros expect in them.
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
+
+ jmp rcx
+
+ PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
+
+ ; Offset 15 uses unaligned loads: the palignr variant would need a shift
+ ; of 15+2 = 17 bytes, presumably why it is not used here. This path
+ ; falls through into store_off.
+.vp8_sad16x8x3_ssse3_aligned_by_15:
+
+ PROCESS_16X2X3 1
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+ PROCESS_16X2X3 0
+
+.vp8_sad16x8x3_ssse3_store_off:
+ mov rdi, arg(4) ;Results
+
+ ; psadbw leaves one partial sum in each 64-bit half of the accumulator:
+ ; add the high half to the low half and store the low dword.
+ movq xmm0, xmm5
+ psrldq xmm5, 8
+
+ paddw xmm0, xmm5
+ movd [rdi], xmm0 ; results[0]: SAD against ref_ptr
+;-
+ movq xmm0, xmm6
+ psrldq xmm6, 8
+
+ paddw xmm0, xmm6
+ movd [rdi+4], xmm0 ; results[1]: SAD against ref_ptr+1
+;-
+ movq xmm0, xmm7
+ psrldq xmm7, 8
+
+ paddw xmm0, xmm7
+ movd [rdi+8], xmm0 ; results[2]: SAD against ref_ptr+2
+
+ ; begin epilog
+ pop rcx
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/libvpx/vp8/common/x86/subpixel_mmx.asm b/libvpx/vp8/common/x86/subpixel_mmx.asm
new file mode 100644
index 0000000..47dd452
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_mmx.asm
@@ -0,0 +1,702 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define vp8_filter_weight 128
+%define VP8_FILTER_SHIFT 7
+
+
+; Horizontal 6-tap filter pass. For each of output_height rows it filters
+; four pixels (reading source bytes p-2..p6), rounds with rd, shifts right by
+; VP8_FILTER_SHIFT, clamps to 0..255 and stores the four results as 16-bit
+; words at output_ptr. vp8_filter points at six taps spaced 16 bytes apart
+; (tap k at byte offset 16*k). output_width is used as the byte step between
+; output rows; pixel_step is not referenced by this implementation.
+;void vp8_filter_block1d_h6_mmx
+;(
+; unsigned char *src_ptr,
+; unsigned short *output_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+global sym(vp8_filter_block1d_h6_mmx) PRIVATE
+sym(vp8_filter_block1d_h6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+
+ ; taps 1..4 are kept in registers; taps 0 and 5 are read from memory
+ ; inside the loop
+ movq mm1, [rdx + 16] ; do both the negative taps first!!!
+ movq mm2, [rdx + 32] ;
+ movq mm6, [rdx + 48] ;
+ movq mm7, [rdx + 64] ;
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+.nextrow:
+ movq mm3, [rsi-2] ; mm3 = p-2..p5
+ movq mm4, mm3 ; mm4 = p-2..p5
+ psrlq mm3, 8 ; mm3 = p-1..p5
+ punpcklbw mm3, mm0 ; mm3 = p-1..p2
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ punpckhbw mm4, mm0 ; mm4 = p2..p5
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, mm5 ; mm4 = p-2..p5;
+ psrlq mm5, 16 ; mm5 = p0..p5;
+ punpcklbw mm5, mm0 ; mm5 = p0..p3
+ pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ movq mm5, mm4 ; mm5 = p-2..p5
+ psrlq mm4, 24 ; mm4 = p1..p5
+ punpcklbw mm4, mm0 ; mm4 = p1..p4
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ ; do outer positive taps
+ movd mm4, [rsi+3]
+ punpcklbw mm4, mm0 ; mm4 = p3..p6
+ pmullw mm4, [rdx+80] ; mm4 *= kernel 5 modifiers
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ punpcklbw mm5, mm0 ; mm5 = p-2..p1
+ pmullw mm5, [rdx] ; mm5 *= kernel 0 modifiers
+ paddsw mm3, mm5 ; mm3 += mm5
+
+ paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and unpack to saturate
+ punpcklbw mm3, mm0 ;
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
+ add rdi, rax;
+%else
+ movsxd r8, dword ptr arg(2) ;src_pixels_per_line
+ add rdi, rax;
+
+ add rsi, r8 ; next line
+%endif
+
+ dec rcx ; decrement count
+ jnz .nextrow ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+; Vertical 6-tap filter pass. src_ptr holds 16-bit samples; for each of
+; output_height rows it combines four samples from six consecutive input rows
+; (rows -2..3 relative to the current one), rounds with rd, shifts right by
+; VP8_FILTER_SHIFT, saturates to bytes and stores four bytes at output_ptr.
+; pixels_per_line is used as the byte stride between input rows and
+; output_pitch as the byte stride between output rows. vp8_filter points at
+; six taps spaced 16 bytes apart. pixel_step and output_width are not
+; referenced by this implementation.
+;void vp8_filter_block1dc_v6_mmx
+;(
+; short *src_ptr,
+; unsigned char *output_ptr,
+; int output_pitch,
+; unsigned int pixels_per_line,
+; unsigned int pixel_step,
+; unsigned int output_height,
+; unsigned int output_width,
+; short * vp8_filter
+;)
+global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
+sym(vp8_filter_block1dc_v6_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; Load the rounding constant before rbx is reused as the filter pointer
+ ; (GLOBAL() may depend on the register set up by GET_GOT - confirm in
+ ; x86_abi_support.asm). rbx is saved here and restored after the loop.
+ movq mm5, [GLOBAL(rd)]
+ push rbx
+ mov rbx, arg(7) ;vp8_filter
+ movq mm1, [rbx + 16] ; do both the negative taps first!!!
+ movq mm2, [rbx + 32] ;
+ movq mm6, [rbx + 48] ;
+ movq mm7, [rbx + 64] ;
+
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+ ; back up two input rows so that [rsi] is row -2
+ sub rsi, rdx
+ sub rsi, rdx
+ movsxd rcx, DWORD PTR arg(5) ;output_height
+ movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
+ pxor mm0, mm0 ; mm0 = 00000000
+
+
+.nextrow_cv:
+ movq mm3, [rsi+rdx] ; mm3 = p0..p3 = row -1
+ pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
+
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
+ pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
+ pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi] ; mm4 = p0..p3 = row -2
+ pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ ; this add is also the per-iteration advance to the next input row
+ add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
+ movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
+ pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+ movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
+ pmullw mm4, [rbx +80] ; mm4 *= kernel 5 modifiers.
+ paddsw mm3, mm4 ; mm3 += mm4
+
+
+ paddsw mm3, mm5 ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ packuswb mm3, mm0 ; pack and saturate
+
+ movd [rdi],mm3 ; store the results in the destination
+ ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
+ ; recon block should be in cache this shouldn't cost much. Its obviously
+ ; avoidable!!!.
+ lea rdi, [rdi+rax] ;
+ dec rcx ; decrement count
+ jnz .nextrow_cv ; next row
+
+ pop rbx
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+; Bilinear prediction of an 8-pixel-wide, 8-row block. Each source row is
+; filtered horizontally with the xoffset filter (pixels at [rsi] and
+; [rsi+1]); each output row then blends the previous and the current
+; horizontally filtered rows with the yoffset filter. Both stages add rd and
+; shift right by VP8_FILTER_SHIFT. Nine source rows are read: one before the
+; loop plus one per output row.
+;void bilinear_predict8x8_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
+sym(vp8_bilinear_predict8x8_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ ; each table entry is indexed with a 32-byte stride; its two taps are
+ ; read from [entry] and [entry+16]
+ shl rax, 5 ; offset * 32
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ; mm1 = first horizontal tap
+
+ movq mm2, [rax+16] ; mm2 = second horizontal tap
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+
+ shl rax, 5 ; offset*32
+ add rax, rcx ; VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8*dst_pitch, the loop end marker
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ ; mm7 carries the previous horizontally filtered row, packed to bytes
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x8:
+ ; horizontal filter of the current source row -> mm3 (low 4), mm4 (high 4)
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ ; previous row (mm7) unpacked to words and scaled by the first vertical tap
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ ; keep the current filtered row for the next iteration
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ ; current row scaled by the second vertical tap, then summed with the
+ ; scaled previous row
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8 ;dst_pitch
+%endif
+ cmp rdi, rcx ; stop once rdi reaches dst_ptr + 8*dst_pitch
+ jne .next_row_8x8
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+; Bilinear prediction of an 8-pixel-wide, 4-row block. Each source row is
+; filtered horizontally with the xoffset filter (pixels at [rsi] and
+; [rsi+1]); each output row then blends the previous and the current
+; horizontally filtered rows with the yoffset filter. Both stages add rd and
+; shift right by VP8_FILTER_SHIFT. Five source rows are read: one before the
+; loop plus one per output row.
+;void bilinear_predict8x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
+sym(vp8_bilinear_predict8x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ ; each table entry is indexed with a 32-byte stride; its two taps are
+ ; read from [entry] and [entry+16]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5 ; offset * 32
+
+ mov rsi, arg(0) ;src_ptr ;
+ add rax, rcx ; HFilter
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ; mm1 = first horizontal tap
+
+ movq mm2, [rax+16] ; mm2 = second horizontal tap
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5 ; offset * 32
+
+ add rax, rcx ; VFilter
+ lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4*dst_pitch, the loop end marker
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ ; mm7 carries the previous horizontally filtered row, packed to bytes
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+ add rsi, rdx ; next line
+.next_row_8x4:
+ ; horizontal filter of the current source row -> mm3 (low 4), mm4 (high 4)
+ movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ movq mm4, mm3 ; make a copy of current line
+
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, mm1 ;
+ pmullw mm4, mm1 ;
+
+ movq mm5, [rsi+1] ;
+ movq mm6, mm5 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0 ;
+
+ pmullw mm5, mm2 ;
+ pmullw mm6, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+ ; previous row (mm7) unpacked to words and scaled by the first vertical tap
+ movq mm5, mm7 ;
+ movq mm6, mm7 ;
+
+ punpcklbw mm5, mm0 ;
+ punpckhbw mm6, mm0
+
+ pmullw mm5, [rax] ;
+ pmullw mm6, [rax] ;
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ ; keep the current filtered row for the next iteration
+ movq mm7, mm3 ;
+ packuswb mm7, mm4 ;
+
+
+ ; current row scaled by the second vertical tap, then summed with the
+ ; scaled previous row
+ pmullw mm3, [rax+16] ;
+ pmullw mm4, [rax+16] ;
+
+ paddw mm3, mm5 ;
+ paddw mm4, mm6 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ paddw mm4, [GLOBAL(rd)] ;
+ psraw mm4, VP8_FILTER_SHIFT ;
+
+ packuswb mm3, mm4
+
+ movq [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+ cmp rdi, rcx ; stop once rdi reaches dst_ptr + 4*dst_pitch
+ jne .next_row_8x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+; Bilinear prediction of a 4-pixel-wide, 4-row block. Each source row is
+; filtered horizontally with the xoffset filter (pixels at [rsi] and
+; [rsi+1]); each output row then blends the previous and the current
+; horizontally filtered rows with the yoffset filter. Both stages add rd and
+; shift right by VP8_FILTER_SHIFT. Five source rows are read: one before the
+; loop plus one per output row.
+;void bilinear_predict4x4_mmx
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
+sym(vp8_bilinear_predict4x4_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ mov rdi, arg(4) ;dst_ptr ;
+
+ ; each table entry is indexed with a 32-byte stride; its two taps are
+ ; read from [entry] and [entry+16]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ shl rax, 5 ; offset * 32
+
+ add rax, rcx ; HFilter
+ mov rsi, arg(0) ;src_ptr ;
+
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+ movq mm1, [rax] ; mm1 = first horizontal tap
+
+ movq mm2, [rax+16] ; mm2 = second horizontal tap
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ pxor mm0, mm0 ;
+ shl rax, 5 ; offset * 32
+
+ add rax, rcx ; VFilter
+ lea rcx, [rdi+rdx*4] ; rcx = dst_ptr + 4*dst_pitch, the loop end marker
+
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
+
+ ; get the first horizontal line done ;
+ movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ ; mm7 carries the previous horizontally filtered row, packed to bytes
+ movq mm7, mm3 ;
+ packuswb mm7, mm0 ;
+
+ add rsi, rdx ; next line
+.next_row_4x4:
+ ; horizontal filter of the current source row -> mm3
+ movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
+ punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
+
+ pmullw mm3, mm1 ;
+ movd mm5, [rsi+1] ;
+
+ punpcklbw mm5, mm0 ;
+ pmullw mm5, mm2 ;
+
+ paddw mm3, mm5 ;
+
+ ; previous row (mm7) unpacked to words and scaled by the first vertical tap
+ movq mm5, mm7 ;
+ punpcklbw mm5, mm0 ;
+
+ pmullw mm5, [rax] ;
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+ movq mm7, mm3 ; keep the current filtered row for the next iteration
+
+ packuswb mm7, mm0 ;
+
+ ; current row scaled by the second vertical tap, then summed with the
+ ; scaled previous row
+ pmullw mm3, [rax+16] ;
+ paddw mm3, mm5 ;
+
+
+ paddw mm3, [GLOBAL(rd)] ; mm3 += round value
+ psraw mm3, VP8_FILTER_SHIFT ; mm3 /= 128
+
+ packuswb mm3, mm0
+ movd [rdi], mm3 ; store the results in the destination
+
+%if ABI_IS_32BIT
+ add rsi, rdx ; next line
+ add rdi, dword ptr arg(5) ;dst_pitch ;
+%else
+ movsxd r8, dword ptr arg(5) ;dst_pitch ;
+ add rsi, rdx ; next line
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx ; stop once rdi reaches dst_ptr + 4*dst_pitch
+ jne .next_row_4x4
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+SECTION_RODATA
+align 16
+; Rounding constant added before the >> VP8_FILTER_SHIFT in the routines
+; above: 0x40 = 64, i.e. half of 1 << 7, replicated in four 16-bit lanes.
+rd:
+ times 4 dw 0x40
+
+align 16
+; Table of eight 6-tap filters. Every tap is replicated eight times
+; (16 bytes), so one filter occupies 96 bytes and tap k of a filter sits at
+; byte offset 16*k - the same spacing the six-tap routines in this file use
+; when indexing their vp8_filter argument. The six taps of each filter sum
+; to 128, matching the >> 7 normalisation.
+global HIDDEN_DATA(sym(vp8_six_tap_mmx))
+sym(vp8_six_tap_mmx):
+ times 8 dw 0 ; filter 0: 0, 0, 128, 0, 0, 0
+ times 8 dw 0
+ times 8 dw 128
+ times 8 dw 0
+ times 8 dw 0
+ times 8 dw 0
+
+ times 8 dw 0 ; filter 1: 0, -6, 123, 12, -1, 0
+ times 8 dw -6
+ times 8 dw 123
+ times 8 dw 12
+ times 8 dw -1
+ times 8 dw 0
+
+ times 8 dw 2 ; filter 2: 2, -11, 108, 36, -8, 1
+ times 8 dw -11
+ times 8 dw 108
+ times 8 dw 36
+ times 8 dw -8
+ times 8 dw 1
+
+ times 8 dw 0 ; filter 3: 0, -9, 93, 50, -6, 0
+ times 8 dw -9
+ times 8 dw 93
+ times 8 dw 50
+ times 8 dw -6
+ times 8 dw 0
+
+ times 8 dw 3 ; filter 4: 3, -16, 77, 77, -16, 3
+ times 8 dw -16
+ times 8 dw 77
+ times 8 dw 77
+ times 8 dw -16
+ times 8 dw 3
+
+ times 8 dw 0 ; filter 5: 0, -6, 50, 93, -9, 0
+ times 8 dw -6
+ times 8 dw 50
+ times 8 dw 93
+ times 8 dw -9
+ times 8 dw 0
+
+ times 8 dw 1 ; filter 6: 1, -8, 36, 108, -11, 2
+ times 8 dw -8
+ times 8 dw 36
+ times 8 dw 108
+ times 8 dw -11
+ times 8 dw 2
+
+ times 8 dw 0 ; filter 7: 0, -1, 12, 123, -6, 0
+ times 8 dw -1
+ times 8 dw 12
+ times 8 dw 123
+ times 8 dw -6
+ times 8 dw 0
+
+
diff --git a/libvpx/vp8/common/x86/subpixel_sse2.asm b/libvpx/vp8/common/x86/subpixel_sse2.asm
new file mode 100644
index 0000000..69f8d10
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_sse2.asm
@@ -0,0 +1,1372 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows; output_height must be at least 1 because the
+; row counter is only tested at the bottom of the loop. Each iteration filters ONE row of
+; 8 pixels and stores the clamped results as eight 16-bit words (one 128-bit store).
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_sse2
+;(
+; unsigned char *src_ptr, // bytes src_ptr-2 .. src_ptr+13 of every row are loaded
+; unsigned short *output_ptr, // written with movdqa, so it must stay 16-byte aligned
+; unsigned int src_pixels_per_line, // added to src_ptr after every row
+; unsigned int pixel_step, // not read by this routine
+; unsigned int output_height, // number of rows to filter
+; unsigned int output_width, // added to output_ptr as a byte count after every row
+; short *vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; src bytes 05 04 03 02 01 00 -1 -2
+ movq xmm1, MMWORD PTR [rsi + 6] ; src bytes 0d 0c 0b 0a 09 08 07 06
+
+ prefetcht2 [rsi+rax-2] ; prefetch the next source row
+
+ pslldq xmm1, 8 ; move bytes 06..0d into the high half
+ por xmm1, xmm3 ; xmm1 = 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
+
+ movdqa xmm4, xmm1 ; copies for taps 2..5 (tap 6 reuses xmm1 itself)
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ ; saturating sum of the six products, accumulated in xmm4 (starts as tap 2)
+ paddsw xmm4, xmm7 ; + tap 5
+ paddsw xmm4, xmm5 ; + tap 3
+
+ paddsw xmm4, xmm3 ; + tap 1
+ paddsw xmm4, xmm6 ; + tap 4
+
+ paddsw xmm4, xmm1 ; + tap 6
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding term
+
+ psraw xmm4, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+
+ packuswb xmm4, xmm0 ; clamp every sum to 0..255
+ punpcklbw xmm4, xmm0 ; widen the 8 clamped bytes back to words
+
+ movdqa XMMWORD Ptr [rdi], xmm4 ; aligned store of the 8 result words
+ lea rsi, [rsi + rax] ; next source row
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr, // bytes src_ptr-2 .. src_ptr+21 of every row are loaded
+; unsigned short *output_ptr, // written with movdqa, so it must stay 16-byte aligned
+; unsigned int src_pixels_per_line, // added to src_ptr after every row
+; unsigned int pixel_step, // not read by this routine
+; unsigned int output_height, // number of rows to filter
+; unsigned int output_width, // added to output_ptr as a byte count after every row
+; short *vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows; output_height must be at least 1 because the
+; row counter is only tested at the bottom of the loop. Each iteration filters ONE row of
+; 16 pixels as two 8-pixel halves and stores them as sixteen 16-bit words (32 bytes).
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(6) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;output_width
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; src bytes 05 04 03 02 01 00 -1 -2
+ movq xmm1, MMWORD PTR [rsi + 6] ; src bytes 0d 0c 0b 0a 09 08 07 06
+
+ movq xmm2, MMWORD PTR [rsi +14] ; src bytes 15 14 13 12 11 10 0f 0e
+ pslldq xmm2, 8 ; move bytes 0e..15 into the high half
+
+ por xmm2, xmm1 ; xmm2 = 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 (second half input)
+ prefetcht2 [rsi+rax-2] ; prefetch the next source row
+
+ pslldq xmm1, 8 ; move bytes 06..0d into the high half
+ por xmm1, xmm3 ; xmm1 = 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 (first half input)
+
+ movdqa xmm4, xmm1 ; copies for taps 2..5 (tap 6 reuses xmm1 itself)
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+ ; saturating sum of the six products, accumulated in xmm4 (starts as tap 2)
+ paddsw xmm4, xmm7 ; + tap 5
+ paddsw xmm4, xmm5 ; + tap 3
+
+ paddsw xmm4, xmm3 ; + tap 1
+ paddsw xmm4, xmm6 ; + tap 4
+
+ paddsw xmm4, xmm1 ; + tap 6
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding term
+
+ psraw xmm4, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+
+ packuswb xmm4, xmm0 ; clamp every sum to 0..255
+ punpcklbw xmm4, xmm0 ; widen the 8 clamped bytes back to words
+
+ movdqa XMMWORD Ptr [rdi], xmm4 ; aligned store of result words 0..7
+ ; second half: same six taps applied to the bytes held in xmm2
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx0d xx0c xx0b xx0a xx09 xx08 xx07 xx06
+ psrldq xmm4, 1 ; xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; Tap 1 on bytes 06..0d
+ punpcklbw xmm4, xmm0 ; xx0e xx0d xx0c xx0b xx0a xx09 xx08 xx07
+
+ psrldq xmm5, 2 ; xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; Tap 2 on bytes 07..0e
+
+
+ punpcklbw xmm5, xmm0 ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+ psrldq xmm6, 3 ; xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09
+
+ pmullw xmm5, [rdx+32] ; Tap 3 on bytes 08..0f
+
+ punpcklbw xmm6, xmm0 ; xx10 xx0f xx0e xx0d xx0c xx0b xx0a xx09
+ psrldq xmm7, 4 ; xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a
+
+ pmullw xmm6, [rdx+48] ; Tap 4 on bytes 09..10
+
+ punpcklbw xmm7, xmm0 ; xx11 xx10 xx0f xx0e xx0d xx0c xx0b xx0a
+ psrldq xmm2, 5 ; xx xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b
+
+ pmullw xmm7, [rdx+64] ; Tap 5 on bytes 0a..11
+
+ punpcklbw xmm2, xmm0 ; xx12 xx11 xx10 xx0f xx0e xx0d xx0c xx0b
+ pmullw xmm2, [rdx+80] ; Tap 6 on bytes 0b..12
+
+ ; saturating sum of the six products, accumulated in xmm4 (starts as tap 2)
+ paddsw xmm4, xmm7 ; + tap 5
+ paddsw xmm4, xmm5 ; + tap 3
+
+ paddsw xmm4, xmm3 ; + tap 1
+ paddsw xmm4, xmm6 ; + tap 4
+
+ paddsw xmm4, xmm2 ; + tap 6
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding term
+
+ psraw xmm4, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+
+ packuswb xmm4, xmm0 ; clamp every sum to 0..255
+ punpcklbw xmm4, xmm0 ; widen the 8 clamped bytes back to words
+
+ movdqa XMMWORD Ptr [rdi+16], xmm4 ; aligned store of result words 8..15
+
+ lea rsi, [rsi + rax] ; next source row
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(5) ;[output_width]
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_sse2
+;(
+; short *src_ptr, // 16-bit samples read with movdqa; the two rows above src_ptr are read too
+; unsigned char *output_ptr, // 8 bytes are stored per row
+; int dst_pitch, // added to output_ptr after every row
+; unsigned int pixels_per_line, // used as the byte distance between source rows
+; unsigned int pixel_step, // not read by this routine
+; unsigned int output_height, // number of rows to produce (must be at least 1)
+; unsigned int output_width, // not read by this routine
+; short * vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+;/************************************************************************************
+; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. Each
+; of the output_height output rows is built from six consecutive source rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx ; step back two rows so that rsi points at
+ sub rsi, rdx ; the first of the six rows being filtered
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rounding term, kept in xmm7 for the whole loop
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_sse2_loop:
+ movdqa xmm1, XMMWORD PTR [rsi] ; row 1
+ pmullw xmm1, [rax] ; * tap 1
+
+ movdqa xmm2, XMMWORD PTR [rsi + rdx] ; row 2
+ pmullw xmm2, [rax + 16] ; * tap 2
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; row 3
+ pmullw xmm3, [rax + 32] ; * tap 3
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; row 5
+ pmullw xmm5, [rax + 64] ; * tap 5
+
+ add rsi, rdx ; advance one row; rows 4 and 6 are addressed from the new rsi
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] ; row 4
+
+ pmullw xmm4, [rax + 48] ; * tap 4
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] ; row 6
+
+ pmullw xmm6, [rax + 80] ; * tap 6
+ ; saturating sum of the six products, accumulated in xmm2 (starts as tap 2)
+ paddsw xmm2, xmm5 ; + tap 5
+ paddsw xmm2, xmm3 ; + tap 3
+
+ paddsw xmm2, xmm1 ; + tap 1
+ paddsw xmm2, xmm4 ; + tap 4
+
+ paddsw xmm2, xmm6 ; + tap 6
+ paddsw xmm2, xmm7 ; + rounding term
+
+ psraw xmm2, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_v6_sse2
+;(
+; unsigned short *src_ptr, // 16-bit samples read with movdqa; the two rows above src_ptr are read too
+; unsigned char *output_ptr, // written with movdqa, 16 bytes per row
+; int dst_pitch, // added to output_ptr after every row
+; unsigned int pixels_per_line, // used as the byte distance between source rows
+; unsigned int pixel_step, // not read by this routine
+; unsigned int output_height, // number of rows to produce (must be at least 1)
+; unsigned int output_width, // not read by this routine
+; const short *vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+;/************************************************************************************
+; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. Each
+; of the output_height output rows is built from six consecutive source rows.
+;*************************************************************************************/
+global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
+sym(vp8_filter_block1d16_v6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(7) ;vp8_filter
+ movsxd rdx, dword ptr arg(3) ;pixels_per_line
+
+ mov rdi, arg(1) ;output_ptr
+ mov rsi, arg(0) ;src_ptr
+
+ sub rsi, rdx ; step back two rows so that rsi points at
+ sub rsi, rdx ; the first of the six rows being filtered
+
+ movsxd rcx, DWORD PTR arg(5) ;[output_height]
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(2) ; dst_pitch
+%endif
+
+.vp8_filter_block1d16_v6_sse2_loop:
+; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
+ movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 (words 0..7)
+ movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] ; line 2 (words 8..15)
+ pmullw xmm1, [rax + 16]
+ pmullw xmm2, [rax + 16]
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm3, [rax + 64]
+ pmullw xmm4, [rax + 64]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm5, [rax + 32]
+ pmullw xmm6, [rax + 32]
+
+ movdqa xmm7, XMMWORD PTR [rsi] ; line 1
+ movdqa xmm0, XMMWORD PTR [rsi + 16]
+ pmullw xmm7, [rax]
+ pmullw xmm0, [rax]
+ ; xmm1 / xmm2 accumulate the low / high 8 sums with saturating adds
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm0
+
+ add rsi, rdx ; advance one row; lines 4 and 6 are addressed from the new rsi
+
+ movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
+ movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
+ pmullw xmm3, [rax + 48]
+ pmullw xmm4, [rax + 48]
+
+ movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
+ movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
+ pmullw xmm5, [rax + 80]
+ pmullw xmm6, [rax + 80]
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rounding term (reloaded every row because xmm7 held line 1)
+ pxor xmm0, xmm0 ; clear xmm0 (not read again before it is reloaded at the loop top)
+
+ paddsw xmm1, xmm3
+ paddsw xmm2, xmm4
+ paddsw xmm1, xmm5
+ paddsw xmm2, xmm6
+
+ paddsw xmm1, xmm7
+ paddsw xmm2, xmm7
+
+ psraw xmm1, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+ psraw xmm2, 7
+
+ packuswb xmm1, xmm2 ; pack and saturate
+ movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(2) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_h6_only_sse2
+;(
+; unsigned char *src_ptr, // bytes src_ptr-2 .. src_ptr+13 of every row are loaded
+; unsigned int src_pixels_per_line, // added to src_ptr after every row
+; unsigned char *output_ptr, // 8 bytes are stored per row
+; int dst_pitch, // added to output_ptr after every row
+; unsigned int output_height, // number of rows to filter (must be at least 1)
+; const short *vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+; First-pass filter only when yoffset==0; the horizontal result is packed to bytes and stored directly
+global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d8_h6_only_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; src bytes 05 04 03 02 01 00 -1 -2
+ movq xmm1, MMWORD PTR [rsi + 6] ; src bytes 0d 0c 0b 0a 09 08 07 06
+
+ prefetcht2 [rsi+rax-2] ; prefetch the next source row
+
+ pslldq xmm1, 8 ; move bytes 06..0d into the high half
+ por xmm1, xmm3 ; xmm1 = 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
+
+ movdqa xmm4, xmm1 ; copies for taps 2..5 (tap 6 reuses xmm1 itself)
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+
+ ; saturating sum of the six products, accumulated in xmm4 (starts as tap 2)
+ paddsw xmm4, xmm7 ; + tap 5
+ paddsw xmm4, xmm5 ; + tap 3
+
+ paddsw xmm4, xmm3 ; + tap 1
+ paddsw xmm4, xmm6 ; + tap 4
+
+ paddsw xmm4, xmm1 ; + tap 6
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding term
+
+ psraw xmm4, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+
+ packuswb xmm4, xmm0 ; clamp every sum to 0..255 and pack to bytes
+
+ movq QWORD PTR [rdi], xmm4 ; store the results in the destination
+ lea rsi, [rsi + rax] ; next source row
+
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+
+ jnz .filter_block1d8_h6_only_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d16_h6_only_sse2
+;(
+; unsigned char *src_ptr, // bytes src_ptr-2 .. src_ptr+21 of every row are loaded
+; unsigned int src_pixels_per_line, // added to src_ptr after every row
+; unsigned char *output_ptr, // 16 bytes are stored per row as two 8-byte stores
+; int dst_pitch, // added to output_ptr after every row
+; unsigned int output_height, // number of rows to filter (must be at least 1)
+; const short *vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+; First-pass filter only when yoffset==0; the horizontal result is packed to bytes and stored directly
+global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
+sym(vp8_filter_block1d16_h6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rdx, arg(5) ;vp8_filter
+ mov rsi, arg(0) ;src_ptr
+
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ;dst_pitch
+%endif
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+
+.filter_block1d16_h6_only_sse2_rowloop:
+ movq xmm3, MMWORD PTR [rsi - 2] ; src bytes 05 04 03 02 01 00 -1 -2
+ movq xmm1, MMWORD PTR [rsi + 6] ; src bytes 0d 0c 0b 0a 09 08 07 06
+
+ movq xmm2, MMWORD PTR [rsi +14] ; src bytes 15 14 13 12 11 10 0f 0e
+ pslldq xmm2, 8 ; move bytes 0e..15 into the high half
+
+ por xmm2, xmm1 ; xmm2 = 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07 06 (second half input)
+ prefetcht2 [rsi+rax-2] ; prefetch the next source row
+
+ pslldq xmm1, 8 ; move bytes 06..0d into the high half
+ por xmm1, xmm3 ; xmm1 = 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 (first half input)
+
+ movdqa xmm4, xmm1 ; copies for taps 2..5 (tap 6 reuses xmm1 itself)
+ movdqa xmm5, xmm1
+
+ movdqa xmm6, xmm1
+ movdqa xmm7, xmm1
+
+ punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
+ psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
+ punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
+
+ psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
+
+ punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+ psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
+
+ pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
+
+ punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
+ psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
+
+ pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
+
+ punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
+ psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
+
+ pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
+
+ punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
+ pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
+ ; saturating sum of the six products, accumulated in xmm4 (starts as tap 2)
+ paddsw xmm4, xmm7 ; + tap 5
+ paddsw xmm4, xmm5 ; + tap 3
+
+ paddsw xmm4, xmm3 ; + tap 1
+ paddsw xmm4, xmm6 ; + tap 4
+
+ paddsw xmm4, xmm1 ; + tap 6
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding term
+
+ psraw xmm4, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+
+ packuswb xmm4, xmm0 ; lower 8 bytes
+
+ movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
+ ; second half: same six taps applied to the bytes held in xmm2
+ movdqa xmm3, xmm2
+ movdqa xmm4, xmm2
+
+ movdqa xmm5, xmm2
+ movdqa xmm6, xmm2
+
+ movdqa xmm7, xmm2
+
+ punpcklbw xmm3, xmm0 ; xx0d xx0c xx0b xx0a xx09 xx08 xx07 xx06
+ psrldq xmm4, 1 ; xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08 07
+
+ pmullw xmm3, XMMWORD PTR [rdx] ; Tap 1 on bytes 06..0d
+ punpcklbw xmm4, xmm0 ; xx0e xx0d xx0c xx0b xx0a xx09 xx08 xx07
+
+ psrldq xmm5, 2 ; xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09 08
+ pmullw xmm4, XMMWORD PTR [rdx+16] ; Tap 2 on bytes 07..0e
+
+ punpcklbw xmm5, xmm0 ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+ psrldq xmm6, 3 ; xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a 09
+
+ pmullw xmm5, [rdx+32] ; Tap 3 on bytes 08..0f
+
+ punpcklbw xmm6, xmm0 ; xx10 xx0f xx0e xx0d xx0c xx0b xx0a xx09
+ psrldq xmm7, 4 ; xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b 0a
+
+ pmullw xmm6, [rdx+48] ; Tap 4 on bytes 09..10
+
+ punpcklbw xmm7, xmm0 ; xx11 xx10 xx0f xx0e xx0d xx0c xx0b xx0a
+ psrldq xmm2, 5 ; xx xx xx xx xx 15 14 13 12 11 10 0f 0e 0d 0c 0b
+
+ pmullw xmm7, [rdx+64] ; Tap 5 on bytes 0a..11
+
+ punpcklbw xmm2, xmm0 ; xx12 xx11 xx10 xx0f xx0e xx0d xx0c xx0b
+ pmullw xmm2, [rdx+80] ; Tap 6 on bytes 0b..12
+ ; saturating sum of the six products, accumulated in xmm4 (starts as tap 2)
+ paddsw xmm4, xmm7 ; + tap 5
+ paddsw xmm4, xmm5 ; + tap 3
+
+ paddsw xmm4, xmm3 ; + tap 1
+ paddsw xmm4, xmm6 ; + tap 4
+
+ paddsw xmm4, xmm2 ; + tap 6
+ paddsw xmm4, [GLOBAL(rd)] ; + rounding term
+
+ psraw xmm4, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+
+ packuswb xmm4, xmm0 ; higher 8 bytes
+
+ movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
+
+ lea rsi, [rsi + rax] ; next source row
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(3) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ dec rcx
+ jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_filter_block1d8_v6_only_sse2
+;(
+; unsigned char *src_ptr, // first of the six rows filtered; this routine does not step back two rows
+; unsigned int src_pixels_per_line, // byte distance between source rows
+; unsigned char *output_ptr, // 8 bytes are stored per row
+; int dst_pitch, // added to output_ptr after every row
+; unsigned int output_height, // number of rows to produce (must be at least 1)
+; const short *vp8_filter // six taps, 16 bytes apart, 8 words read per tap
+;)
+; Second-pass filter only when xoffset==0; source bytes are widened to words in the loop
+global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
+sym(vp8_filter_block1d8_v6_only_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ mov rax, arg(5) ;vp8_filter
+
+ pxor xmm0, xmm0 ; clear xmm0
+
+ movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; rounding term, kept in xmm7 for the whole loop
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(3) ; dst_pitch
+%endif
+
+.vp8_filter_block1d8_v6_only_sse2_loop:
+ movq xmm1, MMWORD PTR [rsi] ; row 1 (8 bytes)
+ movq xmm2, MMWORD PTR [rsi + rdx] ; row 2
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ; row 3
+ movq xmm5, MMWORD PTR [rsi + rdx * 4] ; row 5
+ add rsi, rdx ; advance one row; rows 4 and 6 are addressed from the new rsi
+ movq xmm4, MMWORD PTR [rsi + rdx * 2] ; row 4
+ movq xmm6, MMWORD PTR [rsi + rdx * 4] ; row 6
+ ; widen each row to words and multiply it by its tap
+ punpcklbw xmm1, xmm0
+ pmullw xmm1, [rax] ; row 1 * tap 1
+
+ punpcklbw xmm2, xmm0
+ pmullw xmm2, [rax + 16] ; row 2 * tap 2
+
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax + 32] ; row 3 * tap 3
+
+ punpcklbw xmm5, xmm0
+ pmullw xmm5, [rax + 64] ; row 5 * tap 5
+
+ punpcklbw xmm4, xmm0
+ pmullw xmm4, [rax + 48] ; row 4 * tap 4
+
+ punpcklbw xmm6, xmm0
+ pmullw xmm6, [rax + 80] ; row 6 * tap 6
+ ; saturating sum of the six products, accumulated in xmm2 (starts as tap 2)
+ paddsw xmm2, xmm5 ; + tap 5
+ paddsw xmm2, xmm3 ; + tap 3
+
+ paddsw xmm2, xmm1 ; + tap 1
+ paddsw xmm2, xmm4 ; + tap 4
+
+ paddsw xmm2, xmm6 ; + tap 6
+ paddsw xmm2, xmm7 ; + rounding term
+
+ psraw xmm2, 7 ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
+ packuswb xmm2, xmm0 ; pack and saturate
+
+ movq QWORD PTR [rdi], xmm2 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[dst_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx ; decrement count
+ jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_unpack_block1d16_h6_sse2
+;(
+; unsigned char *src_ptr, // 16 bytes are read per row; no filtering is applied
+; unsigned short *output_ptr, // 16 zero-extended words per row, written with movdqa (16-byte aligned)
+; unsigned int src_pixels_per_line, // added to src_ptr after every row
+; unsigned int output_height, // number of rows to convert (must be at least 1)
+; unsigned int output_width // added to output_ptr as a byte count after every row
+;)
+global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
+sym(vp8_unpack_block1d16_h6_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;output_ptr
+
+ movsxd rcx, dword ptr arg(3) ;output_height
+ movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(4) ;output_width ; byte stride added to output_ptr
+%endif
+
+.unpack_block1d16_h6_sse2_rowloop:
+ movq xmm1, MMWORD PTR [rsi] ; src bytes 07 06 05 04 03 02 01 00
+ movq xmm3, MMWORD PTR [rsi+8] ; src bytes 0f 0e 0d 0c 0b 0a 09 08
+
+ punpcklbw xmm3, xmm0 ; xx0f xx0e xx0d xx0c xx0b xx0a xx09 xx08
+ punpcklbw xmm1, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
+
+ movdqa XMMWORD Ptr [rdi], xmm1 ; words 0..7
+ movdqa XMMWORD Ptr [rdi + 16], xmm3 ; words 8..15
+
+ lea rsi, [rsi + rax] ; next source row
+%if ABI_IS_32BIT
+ add rdi, DWORD Ptr arg(4) ;[output_width]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .unpack_block1d16_h6_sse2_rowloop ; next row
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict16x16_sse2
+;(
+; unsigned char *src_ptr, // read with unaligned 16-byte loads (at src_ptr and, when filtering horizontally, src_ptr+1)
+; int src_pixels_per_line, // added to src_ptr after every row
+; int xoffset, // selects a 32-byte horizontal filter entry; 0 takes the vertical-only path
+; int yoffset, // selects a 32-byte vertical filter entry; 0 (with xoffset != 0) takes the horizontal-only path
+; unsigned char *dst_ptr, // written with movdqa, so every row must be 16-byte aligned
+; int dst_pitch // added to dst_ptr after every row; 16 rows are written
+;)
+extern sym(vp8_bilinear_filters_x86_8)
+global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
+sym(vp8_bilinear_predict16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+ movsxd rax, dword ptr arg(2) ;xoffset
+
+ cmp rax, 0 ;skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 5 ; 32 bytes per filter entry
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax] ; first horizontal tap (8 words)
+ movdqa xmm2, [rax+16] ; second horizontal tap (8 words)
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+
+ cmp rax, 0 ;skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 5 ; 32 bytes per filter entry
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch,
+ lea rcx, [rcx+rdx*8] ; the address at which the row loop stops
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ;dst_pitch
+%endif
+ ; get the first horizontal line done
+ movdqu xmm3, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 (as words)
+ punpckhbw xmm4, xmm0 ; 08 09 10 11 12 13 14 15 (as words)
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3 ; xmm7 keeps the horizontally filtered row,
+ packuswb xmm7, xmm4 ; packed to 16 bytes, for the next iteration
+
+ add rsi, rdx ; next line
+.next_row:
+ movdqu xmm3, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 (as words)
+ punpckhbw xmm4, xmm0 ; 08 09 10 11 12 13 14 15 (as words)
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ movdqa xmm5, xmm7 ; previous filtered row (packed bytes)
+ movdqa xmm6, xmm7
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, [rax] ; previous row * first vertical tap
+ pmullw xmm6, [rax]
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ movdqa xmm7, xmm3 ; save this filtered row (packed) as the
+ packuswb xmm7, xmm4 ; "previous row" of the next iteration
+
+ pmullw xmm3, [rax+16] ; current row * second vertical tap
+ pmullw xmm4, [rax+16]
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rdx ; next line
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ;dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5 ; 32 bytes per filter entry
+ add rax, rcx ;VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax] ; first vertical tap (8 words)
+ movdqa xmm2, [rax+16] ; second vertical tap (8 words)
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch,
+ lea rcx, [rcx+rdx*8] ; the address at which the row loop stops
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+
+ pxor xmm0, xmm0
+
+ ; load the first source line; it is used unfiltered as the "previous row"
+ movdqu xmm7, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+ add rsi, rax ; next line
+.next_row_spo:
+ movdqu xmm3, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+ movdqa xmm5, xmm7 ; previous row
+ movdqa xmm6, xmm7
+
+ movdqa xmm4, xmm3 ; make a copy of current line
+ movdqa xmm7, xmm3 ; current row becomes the previous row of the next iteration
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 (as words)
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm5, xmm1
+ pmullw xmm6, xmm1
+ pmullw xmm3, xmm2
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ;dst_pitch
+ cmp rdi, rcx
+ jne .next_row_spo
+
+ jmp .done
+
+.b16x16_fp_only: ; entered with rdi, rsi, rdx (dst_pitch), xmm1 and xmm2 already set above
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch,
+ lea rcx, [rcx+rdx*8] ; the address at which the row loop stops
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.next_row_fpo:
+ movdqu xmm3, [rsi] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 (as words)
+ punpckhbw xmm4, xmm0
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm1
+
+ movdqu xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
+ movdqa xmm6, xmm5
+
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm6, xmm0
+
+ pmullw xmm5, xmm2
+ pmullw xmm6, xmm2
+
+ paddw xmm3, xmm5
+ paddw xmm4, xmm6
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm4
+ movdqa [rdi], xmm3 ; store the results in the destination
+
+ add rsi, rax ; next line
+ add rdi, rdx ; dst_pitch
+ cmp rdi, rcx
+ jne .next_row_fpo
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_bilinear_predict8x8_sse2
+;(
+; unsigned char *src_ptr, // 9 rows of 16 bytes each are read with unaligned loads
+; int src_pixels_per_line, // byte distance between source rows
+; int xoffset, // selects a 32-byte horizontal filter entry (no special case for 0)
+; int yoffset, // selects a 32-byte vertical filter entry (no special case for 0)
+; unsigned char *dst_ptr, // 8 bytes are stored per row
+; int dst_pitch // added to dst_ptr after every row; 8 rows are written
+;)
+global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
+sym(vp8_bilinear_predict8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax ; macro not visible here - presumably aligns rsp and saves the old value for the 'pop rsp' below (TODO confirm)
+ sub rsp, 144 ; reserve 144 bytes (9 rows * 16 bytes)
+
+ ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+ ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+ lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9-line unaligned data in and put them on stack. This gives a big
+ ;performance boost.
+ movdqu xmm0, [rsi] ; row 0
+ lea rax, [rdx + rdx*2] ; rax = 3 * src_pixels_per_line
+ movdqu xmm1, [rsi+rdx] ; row 1
+ movdqu xmm2, [rsi+rdx*2] ; row 2
+ add rsi, rax
+ movdqu xmm3, [rsi] ; row 3
+ movdqu xmm4, [rsi+rdx] ; row 4
+ movdqu xmm5, [rsi+rdx*2] ; row 5
+ add rsi, rax
+ movdqu xmm6, [rsi] ; row 6
+ movdqu xmm7, [rsi+rdx] ; row 7
+
+ movdqa XMMWORD PTR [rsp], xmm0 ; spill row 0 first so xmm0 can take row 8
+
+ movdqu xmm0, [rsi+rdx*2] ; row 8
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ;xoffset
+ shl rax, 5 ; 32 bytes per filter entry
+ add rax, rcx ;HFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ;dst_pitch
+
+ movdqa xmm1, [rax] ; first horizontal tap (8 words)
+ movdqa xmm2, [rax+16] ; second horizontal tap (8 words)
+
+ movsxd rax, dword ptr arg(3) ;yoffset
+ shl rax, 5 ; 32 bytes per filter entry
+ add rax, rcx ;VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch, where the row loop stops
+
+ movdqa xmm5, [rax] ; first vertical tap (8 words)
+ movdqa xmm6, [rax+16] ; second vertical tap (8 words)
+
+ pxor xmm0, xmm0
+
+ ; get the first horizontal line done
+ movdqa xmm3, XMMWORD PTR [rsp]
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1 ; drop byte 00 so the copy starts at byte 01
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3 ; xmm7 keeps the previous filtered row as 8 words
+ add rsp, 16 ; next line (rsp itself is the cursor over the 9 saved rows)
+.next_row8x8:
+ movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm4, xmm3 ; make a copy of current line
+ psrldq xmm4, 1 ; drop byte 00 so the copy starts at byte 01
+
+ punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
+ punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
+
+ pmullw xmm3, xmm1
+ pmullw xmm4, xmm2
+
+ paddw xmm3, xmm4
+ pmullw xmm7, xmm5 ; previous row * first vertical tap
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm4, xmm3 ; keep the filtered current row for the next iteration
+
+ pmullw xmm3, xmm6 ; current row * second vertical tap
+ paddw xmm3, xmm7
+
+ movdqa xmm7, xmm4 ; current row becomes the previous row
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ packuswb xmm3, xmm0
+ movq [rdi], xmm3 ; store the results in the destination
+
+ add rsp, 16 ; next line
+ add rdi, rdx
+
+ cmp rdi, rcx
+ jne .next_row8x8
+
+ ;add rsp, 144 ; not needed: the 9 'add rsp, 16' steps above already undid the 144-byte reservation
+ pop rsp ; presumably restores the rsp saved by ALIGN_STACK (TODO confirm)
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+rd:
+ times 8 dw 0x40 ; rounding term: 0x40 in each of 8 words, added by the filters above before their shift right by 7
diff --git a/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libvpx/vp8/common/x86/subpixel_ssse3.asm
new file mode 100644
index 0000000..13bcaf6
--- /dev/null
+++ b/libvpx/vp8/common/x86/subpixel_ssse3.asm
@@ -0,0 +1,1507 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP8_FILTER_WEIGHT 128
+%define VP8_FILTER_SHIFT 7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in horizontal direction, calculating ONE
+; row each iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+;void vp8_filter_block1d8_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+;
+; Horizontal filter: writes 8 bytes per row for output_height rows.
+;
+; vp8_filter_index selects one 16-byte entry (index * 16) in the k0_k5 table;
+; the matching k1_k3 and k2_k4 entries are read at +128 and +256 from it.
+; When the first four bytes of the k0_k5 entry are zero the code branches to
+; vp8_filter_block1d8_h4_ssse3, which skips the k0_k5 product entirely.
+;
+; Per row: 8 bytes are loaded from src-2 and 8 from src+3, interleaved, and
+; reshuffled with pshufb so that each pmaddubsw multiplies one pixel pair by
+; one coefficient pair. The partial sums are added with signed saturation,
+; rd (64) is added, the result is shifted right by 7, packed to bytes with
+; unsigned saturation and stored with a single 8-byte movq.
+;
+; The row loop is do-while shaped (the body runs before the counter is
+; tested), so output_height == 0 is not handled.
+global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ movdqa xmm7, [GLOBAL(rd)] ; rounding term, kept in a register for both paths
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ mov rdi, arg(2) ;output_ptr
+
+ cmp esi, DWORD PTR [rax] ; are the first four k0_k5 bytes all zero?
+ je vp8_filter_block1d8_h4_ssse3 ; yes: take the 4-tap path
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx ; pre-decrement: the loop advances rdi before it stores
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4 ; k0/k5 products (pairs are already adjacent)
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)] ; regroup into the pairs used with k2_k4
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)] ; regroup into the pairs used with k1_k3
+ pmaddubsw xmm1, xmm5
+
+ lea rdi, [rdi + rdx] ; advance to this row's output address
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax] ; next source row
+ dec rcx ; sets ZF for the jnz below; lea and the SSE ops in between do not modify flags
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm7 ; + rounding
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0 ; clamp to 0..255
+
+ movq MMWORD Ptr [rdi], xmm0
+ jnz .filter_block1d8_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap continuation of vp8_filter_block1d8_h6_ssse3. Entered by the je above
+; with rax = table entry, rdi = output_ptr, xmm7 = rd and the h6 prologue's
+; stack frame still in place; it returns through its own copy of the epilogue.
+; Note this is a plain (non dot-prefixed) label, unlike the 4-tap entry points
+; of the other functions in this file.
+vp8_filter_block1d8_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] ; shuffle masks held in registers here
+ movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+ mov rsi, arg(0) ;src_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+ sub rdi, rdx ; pre-decrement, same scheme as the 6-tap loop
+
+.filter_block1d8_h4_rowloop_ssse3:
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm2, xmm0
+ pshufb xmm0, xmm3 ; pairs for k2_k4
+
+ pshufb xmm2, xmm4 ; pairs for k1_k3
+ pmaddubsw xmm0, xmm5
+
+ lea rdi, [rdi + rdx]
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+ dec rcx ; ZF consumed by the jnz after the store
+
+ paddsw xmm0, xmm7 ; + rounding
+
+ paddsw xmm0, xmm2
+
+ psraw xmm0, 7
+
+ packuswb xmm0, xmm0
+
+ movq MMWORD Ptr [rdi], xmm0
+
+ jnz .filter_block1d8_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d16_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+;
+; Horizontal 6-tap filter: writes 16 bytes per row for output_height rows.
+;
+; vp8_filter_index selects one 16-byte entry (index * 16) in k0_k5; the
+; k1_k3 / k2_k4 entries are read at +128 / +256 from it. Unlike the 8- and
+; 4-wide variants there is no 4-tap shortcut here: all three coefficient
+; pairs are always applied.
+;
+; Each row is computed as two 8-pixel halves (source offsets -2/+3 and
+; +6/+11), each rounded with rd, shifted right by 7 and packed to bytes; the
+; halves are then joined with punpcklqdq and written with one movdqa, so
+; every output row address must be 16-byte aligned.
+;
+; The row loop is do-while shaped, so output_height == 0 is not handled.
+global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; result unused here: rsi is reloaded with src_ptr below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ mov rdi, arg(2) ;output_ptr
+
+ mov rsi, arg(0) ;src_ptr
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d16_h6_rowloop_ssse3:
+ ; ---- left 8 output pixels ----
+ movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
+
+ movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
+
+ punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
+
+ movdqa xmm1, xmm0
+ pmaddubsw xmm0, xmm4 ; k0/k5 products
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)] ; pairs for k2_k4
+
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)] ; pairs for k1_k3
+ movq xmm3, MMWORD PTR [rsi + 6] ; right half: same pattern shifted by 8 pixels
+
+ pmaddubsw xmm1, xmm5
+ movq xmm7, MMWORD PTR [rsi + 11]
+
+ pmaddubsw xmm2, xmm6
+ punpcklbw xmm3, xmm7 ; interleave for the right 8 output pixels
+
+ paddsw xmm0, xmm1
+ movdqa xmm1, xmm3
+
+ pmaddubsw xmm3, xmm4 ; k0/k5 products, right half
+ paddsw xmm0, xmm2
+
+ movdqa xmm2, xmm1
+ paddsw xmm0, [GLOBAL(rd)] ; + rounding (left half)
+
+ pshufb xmm1, [GLOBAL(shuf2bfrom1)]
+ pshufb xmm2, [GLOBAL(shuf3bfrom1)]
+
+ psraw xmm0, 7
+ pmaddubsw xmm1, xmm5
+
+ pmaddubsw xmm2, xmm6
+ packuswb xmm0, xmm0 ; left half clamped to bytes
+
+ lea rsi, [rsi + rax] ; next source row
+ paddsw xmm3, xmm1
+
+ paddsw xmm3, xmm2
+
+ paddsw xmm3, [GLOBAL(rd)] ; + rounding (right half)
+
+ psraw xmm3, 7
+
+ packuswb xmm3, xmm3
+
+ punpcklqdq xmm0, xmm3 ; low qword = left half, high qword = right half
+
+ movdqa XMMWORD Ptr [rdi], xmm0 ; aligned 16-byte store
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .filter_block1d16_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d4_h6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+;
+; Horizontal filter: writes 4 bytes per row for output_height rows.
+;
+; vp8_filter_index selects one 16-byte entry (index * 16) in k0_k5; the
+; k1_k3 / k2_k4 entries are read at +128 / +256 from it. When the first four
+; bytes of the k0_k5 entry are zero the 4-tap path at
+; .vp8_filter_block1d4_h4_ssse3 is used, which skips the k0_k5 product.
+;
+; Per row: 16 bytes are loaded (unaligned) from src-2, shuffled with
+; shuf1b/shuf2b/shuf3b into the pixel pairs matching each coefficient pair,
+; multiplied with pmaddubsw, summed with signed saturation, rounded with rd,
+; shifted right by 7, packed to bytes and the low 4 bytes are stored.
+;
+; The row loop is do-while shaped, so output_height == 0 is not handled.
+global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_h6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+ movdqa xmm7, [GLOBAL(rd)] ; rounding term, shared by both paths
+
+ cmp esi, DWORD PTR [rax] ; are the first four k0_k5 bytes all zero?
+ je .vp8_filter_block1d4_h4_ssse3 ; yes: take the 4-tap path
+
+ movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+;xmm3 free
+.filter_block1d4_h6_rowloop_ssse3:
+ movdqu xmm0, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf1b)] ; pairs for k0_k5
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf2b)] ; pairs for k2_k4
+ pmaddubsw xmm0, xmm4
+ pshufb xmm2, [GLOBAL(shuf3b)] ; pairs for k1_k3
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax] ; next source row
+;--
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm7 ; + rounding
+ pxor xmm1, xmm1
+ paddsw xmm0, xmm2
+ psraw xmm0, 7
+ packuswb xmm0, xmm0 ; clamp to 0..255
+
+ movd DWORD PTR [rdi], xmm0
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h6_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM ; pairs with SAVE_XMM 7 in the prologue; xmm6/xmm7 are written above
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap path: same frame, rax = table entry, xmm7 = rd on entry.
+.vp8_filter_block1d4_h4_ssse3:
+ movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] ; shuffle masks held in registers here
+ movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rcx, dword ptr arg(4) ;output_height
+
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+
+.filter_block1d4_h4_rowloop_ssse3:
+ movdqu xmm1, XMMWORD PTR [rsi - 2]
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
+ pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
+ pmaddubsw xmm1, xmm5
+
+;--
+ pmaddubsw xmm2, xmm6
+
+ lea rsi, [rsi + rax]
+;--
+ paddsw xmm1, xmm7 ; + rounding
+ paddsw xmm1, xmm2
+ psraw xmm1, 7
+ packuswb xmm1, xmm1
+
+ movd DWORD PTR [rdi], xmm1
+
+ add rdi, rdx
+ dec rcx
+ jnz .filter_block1d4_h4_rowloop_ssse3
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;void vp8_filter_block1d16_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+;
+; Vertical filter: writes 16 bytes per row for output_height rows.
+;
+; For each output row the six source rows A..F at src_ptr + 0..5 * src_pitch
+; are combined: (A,F) with k0_k5, (B,D) with k1_k3 and (C,E) with k2_k4.
+; rax is kept at rsi + pitch so that [rax + rdx*2] addresses row D and
+; [rax + rdx*4] addresses row F. The sum is rounded with rd, shifted right by
+; 7 and packed to bytes with unsigned saturation.
+; NOTE(review): src_ptr is used directly as row A, so the caller presumably
+; passes a pointer already offset above the row being predicted - confirm.
+;
+; vp8_filter_index selects one 16-byte entry (index * 16) in k0_k5, with
+; k1_k3 / k2_k4 at +128 / +256. When the first four bytes of the k0_k5 entry
+; are zero the 4-tap path is used, which never reads rows A and F.
+;
+; The 6-tap path stores each row as two 8-byte movq writes; the 4-tap path
+; stores with one movdqa and therefore needs 16-byte aligned output rows.
+; In 32-bit builds out_pitch is re-read from the argument slot every row
+; because r8 is only loaded when ABI_IS_32BIT=0.
+global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d16_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ cmp esi, DWORD PTR [rax] ; are the first four k0_k5 bytes all zero?
+ je .vp8_filter_block1d16_v4_ssse3 ; yes: take the 4-tap path
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx ; rax = src + pitch, base for rows D and F
+
+
+.vp8_filter_block1d16_v6_ssse3_loop:
+ ; ---- left 8 columns ----
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)] ; + rounding
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2 ;store the results
+
+ ; ---- right 8 columns (same rows, +8 bytes) ----
+ movq xmm1, MMWORD PTR [rsi + 8] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+
+ add rsi, rdx ; slide the 6-row window down by one row
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, [GLOBAL(rd)] ; + rounding
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi+8], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap path: only rows B..E are read. rax = table entry on entry.
+.vp8_filter_block1d16_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx ; rax = src + pitch, base for row D
+
+.vp8_filter_block1d16_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
+ movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
+
+ paddsw xmm2, [GLOBAL(rd)] ; + rounding (left half)
+ paddsw xmm2, xmm3
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ punpcklbw xmm5, xmm4 ;B D
+ punpcklbw xmm1, xmm0 ;C E
+
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm5, xmm7
+
+ movdqa xmm4, [GLOBAL(rd)] ; rounding term for the right half
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm5, xmm1
+ paddsw xmm5, xmm4
+ psraw xmm5, 7
+ packuswb xmm5, xmm5
+
+ punpcklqdq xmm2, xmm5 ; low qword = left half, high qword = right half
+
+ movdqa XMMWORD PTR [rdi], xmm2 ; aligned 16-byte store
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d16_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block1d8_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+;
+; Vertical filter: writes 8 bytes per row for output_height rows.
+;
+; For each output row the six source rows A..F at src_ptr + 0..5 * src_pitch
+; are combined: (A,F) with k0_k5, (B,D) with k1_k3 and (C,E) with k2_k4; the
+; sum is rounded with rd, shifted right by 7 and packed to bytes with
+; unsigned saturation. rax is kept at rsi + pitch so that [rax + rdx*2] is
+; row D and [rax + rdx*4] is row F.
+;
+; vp8_filter_index selects one 16-byte entry (index * 16) in k0_k5, with
+; k1_k3 / k2_k4 at +128 / +256. When the first four bytes of the k0_k5 entry
+; are zero the 4-tap path is used, which never reads rows A and F.
+;
+; Pitch, output pointer and row count are loaded before the branch, so both
+; paths share them. In 32-bit builds out_pitch is re-read from the argument
+; slot every row because r8 is only loaded when ABI_IS_32BIT=0.
+global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d8_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax] ; are the first four k0_k5 bytes all zero?
+ je .vp8_filter_block1d8_v4_ssse3 ; yes: take the 4-tap path
+
+ movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ; rax = src + pitch, base for rows D and F
+
+.vp8_filter_block1d8_v6_ssse3_loop:
+ movq xmm1, MMWORD PTR [rsi] ;A
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
+ movdqa xmm4, [GLOBAL(rd)] ; rounding term (xmm4 is free again after the B D interleave)
+
+ pmaddubsw xmm3, xmm6
+ punpcklbw xmm1, xmm0 ;A F
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm5
+ add rsi, rdx ; slide the 6-row window down by one row
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm1
+ paddsw xmm2, xmm4 ; + rounding
+ psraw xmm2, 7
+ packuswb xmm2, xmm2 ; clamp to 0..255
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap path: only rows B..E are read. On entry rax = table entry and
+; rdx / rdi / rcx (and r8 on 64-bit) are already loaded.
+.vp8_filter_block1d8_v4_ssse3:
+ movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
+ movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
+ movdqa xmm5, [GLOBAL(rd)] ; rounding term, kept in a register for the whole loop
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ; rax = src + pitch, base for row D
+
+.vp8_filter_block1d8_v4_ssse3_loop:
+ movq xmm2, MMWORD PTR [rsi + rdx] ;B
+ movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
+ movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
+ movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw xmm2, xmm4 ;B D
+ punpcklbw xmm3, xmm0 ;C E
+
+ pmaddubsw xmm3, xmm6
+ pmaddubsw xmm2, xmm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw xmm2, xmm3
+ paddsw xmm2, xmm5 ; + rounding
+ psraw xmm2, 7
+ packuswb xmm2, xmm2
+
+ movq MMWORD PTR [rdi], xmm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d8_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+;void vp8_filter_block1d4_v6_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; unsigned int vp8_filter_index
+;)
+;
+; Vertical filter: writes 4 bytes per row for output_height rows.
+;
+; Same scheme as the 8-wide vertical filter (rows A..F at src_ptr +
+; 0..5 * src_pitch, paired (A,F)/(B,D)/(C,E) with k0_k5/k1_k3/k2_k4, rounded
+; with rd, shifted right by 7, packed with unsigned saturation), but done in
+; the 64-bit MMX registers mm0-mm7. Only the first 8 bytes of each 16-byte
+; table entry and of rd are loaded. No xmm register is touched, which is why
+; this function has no SAVE_XMM / RESTORE_XMM.
+; NOTE(review): no emms is executed before returning; presumably callers
+; clear the MMX state before using x87 code - confirm.
+;
+; When the first four bytes of the k0_k5 entry are zero the 4-tap path is
+; used, which never reads rows A and F.
+global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
+sym(vp8_filter_block1d4_v6_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movsxd rdx, DWORD PTR arg(5) ;table index
+ xor rsi, rsi ; rsi = 0, compared against the table entry below
+ shl rdx, 4 ; 16 bytes per table entry
+
+ lea rax, [GLOBAL(k0_k5)]
+ add rax, rdx
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+ mov rdi, arg(2) ;output_ptr
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ; out_pitch
+%endif
+ movsxd rcx, DWORD PTR arg(4) ;[output_height]
+
+ cmp esi, DWORD PTR [rax] ; are the first four k0_k5 bytes all zero?
+ je .vp8_filter_block1d4_v4_ssse3 ; yes: take the 4-tap path
+
+ movq mm5, MMWORD PTR [rax] ;k0_k5
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ; rax = src + pitch, base for rows D and F
+
+.vp8_filter_block1d4_v6_ssse3_loop:
+ movd mm1, DWORD PTR [rsi] ;A
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ movd mm0, DWORD PTR [rax + rdx * 4] ;F
+
+ movq mm4, [GLOBAL(rd)] ; rounding term (first four words of rd)
+
+ pmaddubsw mm3, mm6
+ punpcklbw mm1, mm0 ;A F
+ pmaddubsw mm2, mm7
+ pmaddubsw mm1, mm5
+ add rsi, rdx ; slide the 6-row window down by one row
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm1
+ paddsw mm2, mm4 ; + rounding
+ psraw mm2, 7
+ packuswb mm2, mm2 ; clamp to 0..255
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v6_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+; 4-tap path: only rows B..E are read. On entry rax = table entry and
+; rdx / rdi / rcx (and r8 on 64-bit) are already loaded.
+.vp8_filter_block1d4_v4_ssse3:
+ movq mm6, MMWORD PTR [rax+256] ;k2_k4
+ movq mm7, MMWORD PTR [rax+128] ;k1_k3
+ movq mm5, MMWORD PTR [GLOBAL(rd)] ; rounding term, kept in a register for the whole loop
+
+ mov rsi, arg(0) ;src_ptr
+
+ mov rax, rsi
+ add rax, rdx ; rax = src + pitch, base for row D
+
+.vp8_filter_block1d4_v4_ssse3_loop:
+ movd mm2, DWORD PTR [rsi + rdx] ;B
+ movd mm3, DWORD PTR [rsi + rdx * 2] ;C
+ movd mm4, DWORD PTR [rax + rdx * 2] ;D
+ movd mm0, DWORD PTR [rsi + rdx * 4] ;E
+
+ punpcklbw mm2, mm4 ;B D
+ punpcklbw mm3, mm0 ;C E
+
+ pmaddubsw mm3, mm6
+ pmaddubsw mm2, mm7
+ add rsi, rdx
+ add rax, rdx
+;--
+;--
+ paddsw mm2, mm3
+ paddsw mm2, mm5 ; + rounding
+ psraw mm2, 7
+ packuswb mm2, mm2
+
+ movd DWORD PTR [rdi], mm2
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;[out_pitch]
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .vp8_filter_block1d4_v4_ssse3_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_bilinear_predict16x16_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+;
+; Two-tap (bilinear) prediction of a 16x16 block.
+;
+; xoffset and yoffset each select a 16-byte entry (offset * 16) of
+; vp8_bilinear_filters_ssse3. Three code paths exist:
+;   xoffset != 0, yoffset != 0 : horizontal pass then vertical pass (.next_row)
+;   xoffset == 0               : vertical pass only (.b16x16_sp_only)
+;   xoffset != 0, yoffset == 0 : horizontal pass only (.b16x16_fp_only)
+; Every pass multiplies interleaved neighbour pixels with pmaddubsw, adds rd,
+; shifts right by VP8_FILTER_SHIFT and packs to bytes.
+;
+; The loops end when rdi reaches dst_ptr + 16 * dst_pitch (held in rcx).
+; Destination rows are written with movdqa, so each must be 16-byte aligned.
+; The two-pass and vertical-only paths read 17 source rows; the horizontal
+; passes read 17 bytes per row (loads at +8 and +9 of 8 bytes each).
+; NOTE(review): with xoffset == 0 and yoffset == 0 the vertical-only path
+; would use table entry 0 (128, 0) with pmaddubsw, where 128 is -128 as a
+; signed byte; presumably callers never request that combination - confirm.
+global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
+sym(vp8_bilinear_predict16x16_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(2) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b16x16_sp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b16x16_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch (loop end), built in two steps
+ lea rcx, [rcx+rdx*8]
+ movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
+
+ movdqa xmm2, [rax]
+
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(5) ; dst_pitch
+%endif
+ ; horizontal pass for source row 0; the result is carried in xmm7
+ movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
+
+ punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+ pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ ; horizontal pass for the next source row -> xmm6
+ movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm6, xmm5
+ movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
+ lea rsi, [rsi + rdx] ; next line
+
+ pmaddubsw xmm6, xmm1
+
+ punpcklbw xmm4, xmm5
+ pmaddubsw xmm4, xmm1
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
+ psraw xmm4, VP8_FILTER_SHIFT ; xmm4 /= 128
+
+ packuswb xmm6, xmm4
+ movdqa xmm5, xmm7
+
+ ; vertical pass: interleave previous row (xmm7) with current row (xmm6)
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm2
+
+ punpckhbw xmm7, xmm6
+ pmaddubsw xmm7, xmm2
+
+ paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
+ psraw xmm5, VP8_FILTER_SHIFT ; xmm5 /= 128
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm5, xmm7
+ movdqa xmm7, xmm6 ; current row becomes the previous row
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(5) ; dst_pitch
+%else
+ add rdi, r8
+%endif
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done
+
+; Vertical pass only (xoffset == 0). Two output rows per iteration.
+.b16x16_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ mov rsi, arg(0) ; src_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm1, [rax] ; VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch (loop end)
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+ ; get the first horizontal line done
+ movq xmm4, [rsi] ; load row 0
+ movq xmm2, [rsi + 8] ; load row 0
+
+ lea rsi, [rsi + rax] ; next line
+.next_row_sp:
+ movq xmm3, [rsi] ; load row + 1
+ movq xmm5, [rsi + 8] ; load row + 1
+
+ punpcklbw xmm4, xmm3
+ punpcklbw xmm2, xmm5
+
+ pmaddubsw xmm4, xmm1
+ movq xmm7, [rsi + rax] ; load row + 2
+
+ pmaddubsw xmm2, xmm1
+ movq xmm6, [rsi + rax + 8] ; load row + 2
+
+ punpcklbw xmm3, xmm7
+ punpcklbw xmm5, xmm6
+
+ pmaddubsw xmm3, xmm1
+ paddw xmm4, [GLOBAL(rd)]
+
+ pmaddubsw xmm5, xmm1
+ paddw xmm2, [GLOBAL(rd)]
+
+ psraw xmm4, VP8_FILTER_SHIFT
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ packuswb xmm4, xmm2
+ paddw xmm3, [GLOBAL(rd)]
+
+ movdqa [rdi], xmm4 ; store row 0
+ paddw xmm5, [GLOBAL(rd)]
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ packuswb xmm3, xmm5
+ movdqa xmm4, xmm7 ; row + 2 becomes row 0 of the next iteration (low half)
+
+ movdqa [rdi + rdx],xmm3 ; store row 1
+ lea rsi, [rsi + 2*rax]
+
+ movdqa xmm2, xmm6 ; row + 2 becomes row 0 of the next iteration (high half)
+ lea rdi, [rdi + 2*rdx]
+
+ cmp rdi, rcx
+ jne .next_row_sp
+
+ jmp .done
+
+; Horizontal pass only (yoffset == 0). Two output rows per iteration.
+; Entered with rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch, xmm1 = HFilter.
+.b16x16_fp_only:
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 16 * dst_pitch (loop end)
+ lea rcx, [rcx+rdx*8]
+ movsxd rax, dword ptr arg(1) ; src_pixels_per_line
+
+.next_row_fp:
+ movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
+ movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
+
+ punpcklbw xmm2, xmm4
+ movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
+
+ pmaddubsw xmm2, xmm1
+ movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
+
+ lea rsi, [rsi + rax] ; next line
+ punpcklbw xmm3, xmm4
+
+ pmaddubsw xmm3, xmm1
+ movq xmm5, [rsi] ; second row of this iteration, same byte layout
+
+ paddw xmm2, [GLOBAL(rd)]
+ movq xmm7, [rsi+1]
+
+ movq xmm6, [rsi+8]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ punpcklbw xmm5, xmm7
+ movq xmm7, [rsi+9]
+
+ paddw xmm3, [GLOBAL(rd)]
+ pmaddubsw xmm5, xmm1
+
+ psraw xmm3, VP8_FILTER_SHIFT
+ punpcklbw xmm6, xmm7
+
+ packuswb xmm2, xmm3
+ pmaddubsw xmm6, xmm1
+
+ movdqa [rdi], xmm2 ; store the results in the destination
+ paddw xmm5, [GLOBAL(rd)]
+
+ lea rdi, [rdi + rdx] ; dst_pitch
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm6, VP8_FILTER_SHIFT
+
+ packuswb xmm5, xmm6
+ lea rsi, [rsi + rax] ; next line
+
+ movdqa [rdi], xmm5 ; store the results in the destination
+ lea rdi, [rdi + rdx] ; dst_pitch
+
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+.done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_bilinear_predict8x8_ssse3
+;(
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; int xoffset,
+; int yoffset,
+; unsigned char *dst_ptr,
+; int dst_pitch
+;)
+;
+; Two-tap (bilinear) prediction of an 8x8 block.
+;
+; Nine source rows of 16 bytes each are first copied (unaligned loads) into
+; a 144-byte scratch area on the stack, so all later loads are aligned. Each
+; movdqu reads a full 16 bytes per row although the filters only consume the
+; first 9, so the source must be readable that far.
+;
+; xoffset and yoffset each select a 16-byte entry (offset * 16) of
+; vp8_bilinear_filters_ssse3. Three code paths exist:
+;   xoffset != 0, yoffset != 0 : horizontal then vertical pass (.next_row)
+;   xoffset == 0               : vertical pass only, fully unrolled (.b8x8_sp_only)
+;   xoffset != 0, yoffset == 0 : horizontal pass only, 4 rows per iteration (.b8x8_fp_only)
+;
+; rsp itself is used as the cursor into the scratch rows. Every path advances
+; rsp by exactly 144 bytes in total before reaching .done8x8, where "pop rsp"
+; reloads the stack pointer. ALIGN_STACK comes from x86_abi_support.asm and
+; is not shown here; the pop implies it leaves the pre-alignment rsp on the
+; stack just above the scratch area.
+; NOTE(review): with xoffset == 0 and yoffset == 0 the vertical-only path
+; would use table entry 0 (128, 0) with pmaddubsw, where 128 is -128 as a
+; signed byte; presumably callers never request that combination - confirm.
+global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
+sym(vp8_bilinear_predict8x8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 144 ; reserve 144 bytes
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+
+ mov rsi, arg(0) ;src_ptr
+ movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
+
+ ;Read 9-line unaligned data in and put them on stack. This gives a big
+ ;performance boost.
+ movdqu xmm0, [rsi]
+ lea rax, [rdx + rdx*2] ; rax = 3 * pitch, used to step three rows at a time
+ movdqu xmm1, [rsi+rdx]
+ movdqu xmm2, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi+rdx]
+ movdqu xmm5, [rsi+rdx*2]
+ add rsi, rax
+ movdqu xmm6, [rsi]
+ movdqu xmm7, [rsi+rdx]
+
+ movdqa XMMWORD PTR [rsp], xmm0
+
+ movdqu xmm0, [rsi+rdx*2] ; ninth row; xmm0 was freed by the store above
+
+ movdqa XMMWORD PTR [rsp+16], xmm1
+ movdqa XMMWORD PTR [rsp+32], xmm2
+ movdqa XMMWORD PTR [rsp+48], xmm3
+ movdqa XMMWORD PTR [rsp+64], xmm4
+ movdqa XMMWORD PTR [rsp+80], xmm5
+ movdqa XMMWORD PTR [rsp+96], xmm6
+ movdqa XMMWORD PTR [rsp+112], xmm7
+ movdqa XMMWORD PTR [rsp+128], xmm0
+
+ movsxd rax, dword ptr arg(2) ; xoffset
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .b8x8_sp_only
+
+ shl rax, 4
+ add rax, rcx ; HFilter
+
+ mov rdi, arg(4) ; dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax]
+
+ movsxd rax, dword ptr arg(3) ; yoffset
+ cmp rax, 0 ; skip second_pass filter if yoffset=0
+ je .b8x8_fp_only
+
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch (loop end)
+
+ movdqa xmm1, [rax]
+
+ ; get the first horizontal line done
+ movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
+
+ psrldq xmm5, 1
+ lea rsp, [rsp + 16] ; next line
+
+ punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+ pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
+
+ paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
+ psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
+
+ movdqa xmm7, xmm3
+ packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+
+.next_row:
+ ; horizontal pass for the next scratch row -> xmm6
+ movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ lea rsp, [rsp + 16] ; next line
+
+ movdqa xmm5, xmm6
+
+ psrldq xmm5, 1
+
+ punpcklbw xmm6, xmm5
+ pmaddubsw xmm6, xmm0
+
+ paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
+ psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128
+
+ packuswb xmm6, xmm6
+
+ ; vertical pass: interleave previous row (xmm7) with current row (xmm6)
+ punpcklbw xmm7, xmm6
+ pmaddubsw xmm7, xmm1
+
+ paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
+ psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128
+
+ packuswb xmm7, xmm7
+
+ movq [rdi], xmm7 ; store the results in the destination
+ lea rdi, [rdi + rdx]
+
+ movdqa xmm7, xmm6 ; current row becomes the previous row
+
+ cmp rdi, rcx
+ jne .next_row
+
+ jmp .done8x8
+
+; Vertical pass only (xoffset == 0): all eight output rows are computed
+; straight-line from the nine scratch rows; rsp is bumped once at the end.
+.b8x8_sp_only:
+ movsxd rax, dword ptr arg(3) ; yoffset
+ shl rax, 4
+ lea rax, [rax + rcx] ; VFilter
+
+ mov rdi, arg(4) ;dst_ptr
+ movsxd rdx, dword ptr arg(5) ; dst_pitch
+
+ movdqa xmm0, [rax] ; VFilter
+
+ movq xmm1, XMMWORD PTR [rsp]
+ movq xmm2, XMMWORD PTR [rsp+16]
+
+ movq xmm3, XMMWORD PTR [rsp+32]
+ punpcklbw xmm1, xmm2 ; rows 0,1 interleaved
+
+ movq xmm4, XMMWORD PTR [rsp+48]
+ punpcklbw xmm2, xmm3 ; rows 1,2
+
+ movq xmm5, XMMWORD PTR [rsp+64]
+ punpcklbw xmm3, xmm4 ; rows 2,3
+
+ movq xmm6, XMMWORD PTR [rsp+80]
+ punpcklbw xmm4, xmm5 ; rows 3,4
+
+ movq xmm7, XMMWORD PTR [rsp+96]
+ punpcklbw xmm5, xmm6 ; rows 4,5
+
+ pmaddubsw xmm1, xmm0
+ pmaddubsw xmm2, xmm0
+
+ pmaddubsw xmm3, xmm0
+ pmaddubsw xmm4, xmm0
+
+ pmaddubsw xmm5, xmm0
+ punpcklbw xmm6, xmm7 ; rows 5,6
+
+ pmaddubsw xmm6, xmm0
+ paddw xmm1, [GLOBAL(rd)]
+
+ paddw xmm2, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm2, VP8_FILTER_SHIFT
+
+ paddw xmm4, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm4, VP8_FILTER_SHIFT
+
+ paddw xmm6, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ psraw xmm6, VP8_FILTER_SHIFT
+ packuswb xmm1, xmm1
+
+ packuswb xmm2, xmm2
+ movq [rdi], xmm1 ; output row 0
+
+ packuswb xmm3, xmm3
+ movq [rdi+rdx], xmm2 ; output row 1
+
+ packuswb xmm4, xmm4
+ movq xmm1, XMMWORD PTR [rsp+112] ; scratch row 7
+
+ lea rdi, [rdi + 2*rdx]
+ movq xmm2, XMMWORD PTR [rsp+128] ; scratch row 8
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm3 ; output row 2
+
+ packuswb xmm6, xmm6
+ movq [rdi+rdx], xmm4 ; output row 3
+
+ lea rdi, [rdi + 2*rdx]
+ punpcklbw xmm7, xmm1 ; rows 6,7
+
+ movq [rdi], xmm5 ; output row 4
+ pmaddubsw xmm7, xmm0
+
+ movq [rdi+rdx], xmm6 ; output row 5
+ punpcklbw xmm1, xmm2 ; rows 7,8
+
+ pmaddubsw xmm1, xmm0
+ paddw xmm7, [GLOBAL(rd)]
+
+ psraw xmm7, VP8_FILTER_SHIFT
+ paddw xmm1, [GLOBAL(rd)]
+
+ psraw xmm1, VP8_FILTER_SHIFT
+ packuswb xmm7, xmm7
+
+ packuswb xmm1, xmm1
+ lea rdi, [rdi + 2*rdx]
+
+ movq [rdi], xmm7 ; output row 6
+
+ movq [rdi+rdx], xmm1 ; output row 7
+ lea rsp, [rsp + 144] ; step over the whole scratch area for the pop rsp below
+
+ jmp .done8x8
+
+; Horizontal pass only (yoffset == 0): four rows per iteration, two iterations.
+; Entered with rdi = dst_ptr, rdx = dst_pitch, xmm0 = HFilter.
+.b8x8_fp_only:
+ lea rcx, [rdi+rdx*8] ; rcx = dst_ptr + 8 * dst_pitch (loop end)
+
+.next_row_fp:
+ movdqa xmm1, XMMWORD PTR [rsp]
+ movdqa xmm3, XMMWORD PTR [rsp+16]
+
+ movdqa xmm2, xmm1
+ movdqa xmm5, XMMWORD PTR [rsp+32]
+
+ psrldq xmm2, 1 ; row shifted left by one pixel
+ movdqa xmm7, XMMWORD PTR [rsp+48]
+
+ movdqa xmm4, xmm3
+ psrldq xmm4, 1
+
+ movdqa xmm6, xmm5
+ psrldq xmm6, 1
+
+ punpcklbw xmm1, xmm2
+ pmaddubsw xmm1, xmm0
+
+ punpcklbw xmm3, xmm4
+ pmaddubsw xmm3, xmm0
+
+ punpcklbw xmm5, xmm6
+ pmaddubsw xmm5, xmm0
+
+ movdqa xmm2, xmm7
+ psrldq xmm2, 1
+
+ punpcklbw xmm7, xmm2
+ pmaddubsw xmm7, xmm0
+
+ paddw xmm1, [GLOBAL(rd)]
+ psraw xmm1, VP8_FILTER_SHIFT
+
+ paddw xmm3, [GLOBAL(rd)]
+ psraw xmm3, VP8_FILTER_SHIFT
+
+ paddw xmm5, [GLOBAL(rd)]
+ psraw xmm5, VP8_FILTER_SHIFT
+
+ paddw xmm7, [GLOBAL(rd)]
+ psraw xmm7, VP8_FILTER_SHIFT
+
+ packuswb xmm1, xmm1
+ packuswb xmm3, xmm3
+
+ packuswb xmm5, xmm5
+ movq [rdi], xmm1
+
+ packuswb xmm7, xmm7
+ movq [rdi+rdx], xmm3
+
+ lea rdi, [rdi + 2*rdx]
+ movq [rdi], xmm5
+
+ lea rsp, [rsp + 4*16] ; four scratch rows consumed
+ movq [rdi+rdx], xmm7
+
+ lea rdi, [rdi + 2*rdx]
+ cmp rdi, rcx
+
+ jne .next_row_fp
+
+ lea rsp, [rsp + 16] ; skip the unused ninth scratch row (2 * 64 + 16 = 144)
+
+.done8x8:
+ ;add rsp, 144
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+; pshufb masks for the 4-wide horizontal filter. They index a linear 16-byte
+; load that starts at pixel -2 (byte i holds pixel i - 2):
+;   shuf1b pairs pixels (-2,3), (-1,4), ...  -> multiplied by k0_k5
+;   shuf2b pairs pixels ( 0,2), ( 1,3), ...  -> multiplied by k2_k4
+;   shuf3b pairs pixels (-1,1), ( 0,2), ...  -> multiplied by k1_k3
+shuf1b:
+ db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
+shuf2b:
+ db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
+shuf3b:
+ db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+; pshufb masks for the 8- and 16-wide horizontal filters. They index the
+; already interleaved register "-2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10" (which is
+; used as-is for k0_k5) and regroup it into the pairs for k2_k4
+; (shuf2bfrom1) and k1_k3 (shuf3bfrom1).
+align 16
+shuf2bfrom1:
+ db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+ db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
+
+; Rounding term: eight words of 0x40 (64), added before the shift right by 7.
+align 16
+rd:
+ times 8 dw 0x40
+
+; Coefficient tables for the six-tap filters. Each table has eight 16-byte
+; entries, one per filter index, and each entry repeats one byte pair eight
+; times so that a single pmaddubsw applies it to eight pixel pairs. The code
+; addresses k1_k3 and k2_k4 as k0_k5 + 128 and k0_k5 + 256, so the three
+; tables must stay contiguous and in this order.
+; pmaddubsw reads these bytes as signed values. An entry of k0_k5 whose
+; first four bytes are zero makes the filter functions take their 4-tap path.
+align 16
+k0_k5:
+ times 8 db 0, 0 ;placeholder
+ times 8 db 0, 0
+ times 8 db 2, 1
+ times 8 db 0, 0
+ times 8 db 3, 3
+ times 8 db 0, 0
+ times 8 db 1, 2
+ times 8 db 0, 0
+k1_k3:
+ times 8 db 0, 0 ;placeholder
+ times 8 db -6, 12
+ times 8 db -11, 36
+ times 8 db -9, 50
+ times 8 db -16, 77
+ times 8 db -6, 93
+ times 8 db -8, 108
+ times 8 db -1, 123
+k2_k4:
+ times 8 db 128, 0 ;placeholder (128 is -128 as a signed byte; index 0 is presumably never used - confirm)
+ times 8 db 123, -1
+ times 8 db 108, -8
+ times 8 db 93, -6
+ times 8 db 77, -16
+ times 8 db 50, -9
+ times 8 db 36, -11
+ times 8 db 12, -6
+; Two-tap bilinear weights, one 16-byte entry per offset 0..7; each pair sums
+; to 128. Used with pmaddubsw, which reads them as signed bytes, so the 128
+; in entry 0 would act as -128. The bilinear functions skip the horizontal
+; pass when xoffset == 0 and the vertical pass when xoffset != 0 and
+; yoffset == 0, which avoids entry 0 on those paths.
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
+
diff --git a/libvpx/vp8/common/x86/variance_impl_mmx.asm b/libvpx/vp8/common/x86/variance_impl_mmx.asm
new file mode 100644
index 0000000..d9120d0
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_mmx.asm
@@ -0,0 +1,851 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+;
+; Returns the sum of squares of the 256 signed 16-bit values starting at
+; src_ptr (16 loop iterations, 32 bytes = 16 words per iteration).
+; The sum is accumulated in two 32-bit lanes of mm4 and folded at the end.
+; No emms is executed here.
+; NOTE(review): callers are presumably responsible for clearing the MMX
+; state afterwards -- confirm.
+global sym(vp8_get_mb_ss_mmx) PRIVATE
+sym(vp8_get_mb_ss_mmx):
+ push rbp
+ mov rbp, rsp
+ ; NOTE(review): 7 shadowed args are declared although the prototype has a
+ ; single parameter (the SSE2 sibling declares 1) -- confirm before changing.
+ SHADOW_ARGS_TO_STACK 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 8 ; 8-byte scratch slot used to spill mm4
+ ; end prolog
+
+ mov rax, arg(0) ;src_ptr
+ mov rcx, 16 ; iteration count
+ pxor mm4, mm4 ; mm4 = two dword accumulators
+
+.NEXTROW:
+ movq mm0, [rax] ; 4 words each
+ movq mm1, [rax+8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
+ pmaddwd mm0, mm0 ; square and add adjacent pairs -> 2 dwords
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+
+ paddd mm4, mm0
+ paddd mm4, mm1
+ paddd mm4, mm2
+ paddd mm4, mm3
+
+ add rax, 32
+ dec rcx
+ ; dec leaves CF untouched, so the former `ja` (taken when CF=0 and ZF=0)
+ ; depended on the carry produced by `add rax, 32`. Test ZF only.
+ jnz .NEXTROW
+ movq QWORD PTR [rsp], mm4 ; spill both partial sums
+
+ ;return sum[0]+sum[1];
+ movsxd rax, dword ptr [rsp]
+ movsxd rcx, dword ptr [rsp+4]
+ add rax, rcx
+
+
+ ; begin epilog
+ add rsp, 8
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get8x8var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+;
+; Compares an 8x8 block of bytes at src_ptr with an 8x8 block at ref_ptr.
+; Writes the sum of (src - ref) to *Sum and the sum of (src - ref)^2 to *SSE,
+; then returns 0. The eight rows are fully unrolled.
+;
+; Register roles:
+;   rax = src row pointer      rcx = source_stride
+;   rbx = ref row pointer      rdx = recon_stride
+;   mm5 = four 16-bit running sums of differences
+;   mm6 = zero (unpack helper)
+;   mm7 = two 32-bit running sums of squared differences
+; The ref row for the next iteration is preloaded into mm1 at the end of each
+; row. No emms is executed here.
+global sym(vp8_get8x8var_mmx) PRIVATE
+sym(vp8_get8x8var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16 ; scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
+ ; end prolog
+
+
+ pxor mm5, mm5 ; mm5 = 0, sum-of-differences accumulator
+ pxor mm6, mm6 ; mm6 = 0, zero source for byte->word unpack
+ pxor mm7, mm7 ; mm7 = 0, sum-of-squares accumulator
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm1, [rbx] ; Copy eight bytes to mm1
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload next ref row (eight bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+
+ ; Row 2
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload next ref row (eight bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 3
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload next ref row (eight bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 4
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload next ref row (eight bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 5
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload next ref row (eight bytes) into mm1
+ ; movq mm4, [rbx + rdx]
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 6
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload next ref row (eight bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 7
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movq mm1, [rbx] ; Preload last ref row (eight bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Row 8 (last row: no further ref row is preloaded)
+ movq mm0, [rax] ; Copy eight bytes to mm0
+ movq mm2, mm0 ; Take copies
+ movq mm3, mm1 ; Take copies
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ punpckhbw mm2, mm6 ; unpack to higher precision
+ punpckhbw mm3, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ psubsw mm2, mm3 ; A-B (high order) to MM2
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+ paddw mm5, mm2 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ pmaddwd mm2, mm2 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data (value not used afterwards)
+ add rax,rcx ; Inc pointer into the new data (value not used afterwards)
+ paddd mm7, mm0 ; accumulate in mm7
+ paddd mm7, mm2 ; accumulate in mm7
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8] ; sign-extend the four 16-bit partial sums
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp] ; the two 32-bit partial sums of squares
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax ; *SSE = XXSum
+ mov dword ptr [rdi], edx ; *Sum = XSum
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride,
+; unsigned int *SSE,
+; int *Sum
+;)
+;
+; Compares a 4x4 block of bytes at src_ptr with a 4x4 block at ref_ptr.
+; Writes the sum of (src - ref) to *Sum and the sum of (src - ref)^2 to *SSE,
+; then returns 0.
+;
+; Each row is loaded with movd (four bytes). Only the low four bytes of a row
+; are ever unpacked, so the previous 8-byte movq loads read four bytes past
+; the end of every row without affecting the result.
+;
+; Register roles:
+;   rax = src row pointer      rcx = source_stride
+;   rbx = ref row pointer      rdx = recon_stride
+;   mm5 = four 16-bit running sums of differences
+;   mm6 = zero (unpack helper)
+;   mm7 = two 32-bit running sums of squared differences
+; No emms is executed here.
+global sym(vp8_get4x4var_mmx) PRIVATE
+sym(vp8_get4x4var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ push rbx
+ sub rsp, 16 ; scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
+ ; end prolog
+
+
+ pxor mm5, mm5 ; mm5 = 0, sum-of-differences accumulator
+ pxor mm6, mm6 ; mm6 = 0, zero source for byte->word unpack
+ pxor mm7, mm7 ; mm7 = 0, sum-of-squares accumulator
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+
+ ; Row 1
+ movd mm0, [rax] ; Copy four bytes to mm0
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Preload next ref row (four bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Row 2
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Preload next ref row (four bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Preload last ref row (four bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movd mm0, [rax] ; Copy four bytes to mm0
+
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ paddw mm5, mm0 ; accumulate differences in mm5
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+
+ ; Now accumulate the final results.
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
+ movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
+ movsx rdx, WORD PTR [rsp+8] ; sign-extend the four 16-bit partial sums
+ movsx rcx, WORD PTR [rsp+10]
+ movsx rbx, WORD PTR [rsp+12]
+ movsx rax, WORD PTR [rsp+14]
+ add rdx, rcx
+ add rbx, rax
+ add rdx, rbx ;XSum
+ movsxd rax, DWORD PTR [rsp] ; the two 32-bit partial sums of squares
+ movsxd rcx, DWORD PTR [rsp+4]
+ add rax, rcx ;XXSum
+ mov rsi, arg(4) ;SSE
+ mov rdi, arg(5) ;Sum
+ mov dword ptr [rsi], eax ; *SSE = XXSum
+ mov dword ptr [rdi], edx ; *Sum = XSum
+ xor rax, rax ; return 0
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+; unsigned char *src_ptr,
+; int source_stride,
+; unsigned char *ref_ptr,
+; int recon_stride
+;)
+;
+; Returns the sum of squared differences between a 4x4 block of bytes at
+; src_ptr and a 4x4 block at ref_ptr. Rows are loaded four bytes at a time
+; with movd.
+;
+; Register roles:
+;   rax = src row pointer      rcx = source_stride
+;   rbx = ref row pointer      rdx = recon_stride
+;   mm6 = zero (unpack helper)
+;   mm7 = two 32-bit running sums of squared differences
+; No emms is executed here.
+global sym(vp8_get4x4sse_cs_mmx) PRIVATE
+sym(vp8_get4x4sse_cs_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 4
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+
+ pxor mm6, mm6 ; mm6 = 0, zero source for byte->word unpack
+ pxor mm7, mm7 ; mm7 = 0, sum-of-squares accumulator
+
+ mov rax, arg(0) ;[src_ptr] ; Load base addresses
+ mov rbx, arg(2) ;[ref_ptr]
+ movsxd rcx, dword ptr arg(1) ;[source_stride]
+ movsxd rdx, dword ptr arg(3) ;[recon_stride]
+ ; Row 1
+ movd mm0, [rax] ; Copy four bytes to mm0
+ movd mm1, [rbx] ; Copy four bytes to mm1
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Preload next ref row (four bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 2
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Preload next ref row (four bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 3
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm1, mm6
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+
+ pmaddwd mm0, mm0 ; square and accumulate
+ add rbx,rdx ; Inc pointer into ref data
+ add rax,rcx ; Inc pointer into the new data
+ movd mm1, [rbx] ; Preload last ref row (four bytes) into mm1
+ paddd mm7, mm0 ; accumulate in mm7
+
+ ; Row 4
+ movd mm0, [rax] ; Copy four bytes to mm0
+ punpcklbw mm0, mm6 ; unpack to higher precision
+ punpcklbw mm1, mm6
+ psubsw mm0, mm1 ; A-B (low order) to MM0
+ pmaddwd mm0, mm0 ; square and accumulate
+ paddd mm7, mm0 ; accumulate in mm7
+
+ movq mm0, mm7 ; mm0 = both partial sums
+ psrlq mm7, 32 ; move the high partial sum into the low dword
+
+ paddd mm0, mm7 ; low dword of mm0 = total sum of squares
+ movq rax, mm0 ; result is the low 32 bits (prototype returns unsigned int)
+
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%define mmx_filter_shift 7
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+;
+; Applies a two-tap horizontal filter followed by a two-tap vertical filter
+; to a 4-pixel-wide region at ref_ptr, subtracts the 4x4 block at src_ptr
+; and writes the sum of the differences to *sum and the sum of the squared
+; differences to *sumsquared.
+;
+; HFilter and VFilter are each read as two groups of four words: the first
+; tap at offset 0 and the second tap at offset 8.
+; Five ref rows are read (one before the loop, four inside it), and each row
+; read covers bytes 0..4 because of the [rsi+1] load.
+; Every filter stage adds mmx_bi_rd and shifts right by mmx_filter_shift.
+;
+; Register roles:
+;   rsi = ref row pointer      rdi = src row pointer      rcx = rows left
+;   rax = HFilter              rdx = VFilter
+;   mm0 = zero (unpack helper)
+;   mm5 = horizontally filtered previous row (four words)
+;   mm6 = four 16-bit running sums of differences
+;   mm7 = two 32-bit running sums of squared differences
+; No emms is executed here.
+global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16 ; reserved scratch; not referenced in this routine
+ ; end prolog
+
+
+ pxor mm6, mm6 ; sum-of-differences accumulator
+ pxor mm7, mm7 ; sum-of-squares accumulator
+
+ mov rax, arg(4) ;HFilter ;
+ mov rdx, arg(5) ;VFilter ;
+
+ mov rsi, arg(0) ;ref_ptr ;
+ mov rdi, arg(2) ;src_ptr ;
+
+ mov rcx, 4 ; four output rows
+ pxor mm0, mm0 ; zero source for byte->word unpack
+
+ ; Horizontal pass on the first ref row.
+ movd mm1, [rsi] ; pixels 0..3
+ movd mm3, [rsi+1] ; pixels 1..4
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ; * first horizontal tap
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ; * second horizontal tap
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ; rounding
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm5, mm1 ; keep as "previous row" for the vertical pass
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+ ; Horizontal pass on the current ref row.
+ movd mm1, [rsi] ;
+ movd mm3, [rsi+1] ;
+
+ punpcklbw mm1, mm0 ;
+ pmullw mm1, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ pmullw mm3, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ movq mm3, mm5 ; mm3 = previous filtered row
+
+ ; Vertical pass between the previous and the current filtered rows.
+ movq mm5, mm1 ; current row becomes the next "previous row"
+ pmullw mm3, [rdx] ; * first vertical tap
+
+ pmullw mm1, [rdx+8] ; * second vertical tap
+ paddw mm1, mm3 ;
+
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ movd mm3, [rdi] ; four src pixels
+ punpcklbw mm3, mm0 ;
+
+ psubw mm1, mm3 ; filtered ref - src
+ paddw mm6, mm1 ; accumulate differences
+
+ pmaddwd mm1, mm1 ; square and add adjacent pairs
+ paddd mm7, mm1 ; accumulate squares
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil4x4_var_mmx_loop ;
+
+
+ ; Reduce mm6: place each 16-bit sum in the upper half of a dword, add the
+ ; four dwords, then shift right arithmetically by 16 to sign-extend.
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ; low dword of mm2 = signed sum of differences
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ; low dword of mm4 = sum of squared differences
+
+ mov rdi, arg(6) ;sum
+ mov rsi, arg(7) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ; *sum
+ movd dword ptr [rsi], mm4 ; *sumsquared
+
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; unsigned short *HFilter,
+; unsigned short *VFilter,
+; int *sum,
+; unsigned int *sumsquared
+;)
+;
+; Eight-pixel-wide variant of the two-pass (horizontal, then vertical)
+; two-tap filter: filters the region at ref_ptr, subtracts Height rows of
+; eight src pixels, and writes the sum of the differences to *sum and the
+; sum of the squared differences to *sumsquared.
+;
+; HFilter and VFilter are each read as two groups of four words (offsets 0
+; and 8); the same four words are applied to the low and the high half of a
+; row. Height+1 ref rows are read, and each row read covers bytes 0..8
+; because of the [rsi+1] load.
+; The loop counter is tested after the body, so Height is expected to be
+; at least 1.
+;
+; Register roles:
+;   rsi = ref row pointer      rdi = src row pointer      rcx = rows left
+;   rax = HFilter              rdx = VFilter
+;   mm0 = zero (unpack helper)
+;   mm5 = horizontally filtered previous row, packed back to eight bytes
+;   mm6 = four 16-bit running sums of differences
+;   mm7 = two 32-bit running sums of squared differences
+; No emms is executed here.
+global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
+sym(vp8_filter_block2d_bil_var_mmx):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16 ; reserved scratch; not referenced in this routine
+ ; end prolog
+
+ pxor mm6, mm6 ; sum-of-differences accumulator
+ pxor mm7, mm7 ; sum-of-squares accumulator
+ mov rax, arg(5) ;HFilter ;
+
+ mov rdx, arg(6) ;VFilter ;
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+
+ ; Horizontal pass on the first ref row (low half in mm1, high half in mm2).
+ pxor mm0, mm0 ; zero source for byte->word unpack
+ movq mm1, [rsi] ; pixels 0..7
+
+ movq mm3, [rsi+1] ; pixels 1..8
+ movq mm2, mm1 ;
+
+ movq mm4, mm3 ;
+ punpcklbw mm1, mm0 ;
+
+ punpckhbw mm2, mm0 ;
+ pmullw mm1, [rax] ; * first horizontal tap
+
+ pmullw mm2, [rax] ;
+ punpcklbw mm3, mm0 ;
+
+ punpckhbw mm4, mm0 ;
+ pmullw mm3, [rax+8] ; * second horizontal tap
+
+ pmullw mm4, [rax+8] ;
+ paddw mm1, mm3 ;
+
+ paddw mm2, mm4 ;
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ; rounding
+
+ psraw mm1, mmx_filter_shift ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm2, mmx_filter_shift ;
+ movq mm5, mm1
+
+ packuswb mm5, mm2 ; keep the filtered row as eight bytes
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ add rsi, r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+ ; Horizontal pass on the current ref row.
+ movq mm1, [rsi] ;
+ movq mm3, [rsi+1] ;
+
+ movq mm2, mm1 ;
+ movq mm4, mm3 ;
+
+ punpcklbw mm1, mm0 ;
+ punpckhbw mm2, mm0 ;
+
+ pmullw mm1, [rax] ;
+ pmullw mm2, [rax] ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ pmullw mm3, [rax+8] ;
+ pmullw mm4, [rax+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm1, mmx_filter_shift ;
+
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+ psraw mm2, mmx_filter_shift ;
+
+ ; Unpack the previous filtered row and store the current one in its place.
+ movq mm3, mm5 ;
+ movq mm4, mm5 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ movq mm5, mm1 ;
+ packuswb mm5, mm2 ;
+
+ ; Vertical pass between the previous and the current filtered rows.
+ pmullw mm3, [rdx] ; * first vertical tap
+ pmullw mm4, [rdx] ;
+
+ pmullw mm1, [rdx+8] ; * second vertical tap
+ pmullw mm2, [rdx+8] ;
+
+ paddw mm1, mm3 ;
+ paddw mm2, mm4 ;
+
+ paddw mm1, [GLOBAL(mmx_bi_rd)] ;
+ paddw mm2, [GLOBAL(mmx_bi_rd)] ;
+
+ psraw mm1, mmx_filter_shift ;
+ psraw mm2, mmx_filter_shift ;
+
+ movq mm3, [rdi] ; eight src pixels
+ movq mm4, mm3 ;
+
+ punpcklbw mm3, mm0 ;
+ punpckhbw mm4, mm0 ;
+
+ psubw mm1, mm3 ; filtered ref - src (low half)
+ psubw mm2, mm4 ; filtered ref - src (high half)
+
+ paddw mm6, mm1 ; accumulate differences
+ pmaddwd mm1, mm1 ; square and add adjacent pairs
+
+ paddw mm6, mm2 ;
+ pmaddwd mm2, mm2 ;
+
+ paddd mm7, mm1 ; accumulate squares
+ paddd mm7, mm2 ;
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
+ add rdi, dword ptr arg(3) ;src_pixels_per_line ;
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz .filter_block2d_bil_var_mmx_loop ;
+
+
+ ; Reduce mm6: place each 16-bit sum in the upper half of a dword, add the
+ ; four dwords, then shift right arithmetically by 16 to sign-extend.
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ; low dword of mm2 = signed sum of differences
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ; low dword of mm4 = sum of squared differences
+
+ mov rdi, arg(7) ;sum
+ mov rsi, arg(8) ;sumsquared
+
+ movd dword ptr [rdi], mm2 ; *sum
+ movd dword ptr [rsi], mm4 ; *sumsquared
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+; Rounding term for the bilinear filters in this file: 64 is half of
+; 1 << mmx_filter_shift (7), and it is added to each word before the
+; arithmetic right shift by mmx_filter_shift.
+align 16
+mmx_bi_rd:
+ times 4 dw 64
diff --git a/libvpx/vp8/common/x86/variance_impl_sse2.asm b/libvpx/vp8/common/x86/variance_impl_sse2.asm
new file mode 100644
index 0000000..761433c
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_sse2.asm
@@ -0,0 +1,1359 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+; short *src_ptr
+;)
+;
+; Returns the sum of squares of the 256 signed 16-bit values starting at
+; src_ptr (8 loop iterations, 64 bytes = 32 words per iteration).
+; src_ptr must be 16-byte aligned because the rows are loaded with movdqa.
+; Only xmm0-xmm4 are used.
+global sym(vp8_get_mb_ss_sse2) PRIVATE
+sym(vp8_get_mb_ss_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 1
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16 ; reserved scratch; not referenced in this routine
+ ; end prolog
+
+
+ mov rax, arg(0) ;[src_ptr]
+ mov rcx, 8 ; iteration count
+ pxor xmm4, xmm4 ; xmm4 = four dword accumulators
+
+.NEXTROW:
+ movdqa xmm0, [rax] ; 8 words each
+ movdqa xmm1, [rax+16]
+ movdqa xmm2, [rax+32]
+ movdqa xmm3, [rax+48]
+ pmaddwd xmm0, xmm0 ; square and add adjacent pairs -> 4 dwords
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+
+ paddd xmm0, xmm1
+ paddd xmm2, xmm3
+ paddd xmm4, xmm0
+ paddd xmm4, xmm2
+
+ add rax, 0x40
+ dec rcx
+ ; dec leaves CF untouched, so the former `ja` (taken when CF=0 and ZF=0)
+ ; depended on the carry produced by `add rax, 0x40`. Test ZF only.
+ jnz .NEXTROW
+
+ ; Horizontal add of the four dwords: fold the upper 8 bytes onto the lower,
+ ; then fold dword 1 onto dword 0.
+ movdqa xmm3,xmm4
+ psrldq xmm4,8
+ paddd xmm4,xmm3
+ movdqa xmm3,xmm4
+ psrldq xmm4,4
+ paddd xmm4,xmm3
+ movq rax,xmm4 ; result is the low 32 bits (prototype returns unsigned int)
+
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+;
+; Compares a 16x16 block of bytes at src_ptr with a 16x16 block at ref_ptr.
+; Writes the sum of (src - ref) to *Sum and the sum of (src - ref)^2 to *SSE.
+; Rows are loaded with movdqu, so neither pointer needs to be aligned.
+; NOTE(review): the prototype declares an unsigned int return value, but the
+; last write to rax is the Sum pointer -- callers presumably ignore the
+; return value; confirm.
+;
+; Register roles:
+;   rsi = src row pointer      rax = source_stride
+;   rdi = ref row pointer      rdx = recon_stride
+;   rcx = rows left            rbx = temporary for prefetch addresses
+;   xmm0 = zero (unpack helper)
+;   xmm7 = eight 16-bit running sums of differences
+;   xmm6 = four 32-bit running sums of squared differences
+global sym(vp8_get16x16var_sse2) PRIVATE
+sym(vp8_get16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ ; Prefetch data: the first eight rows of src, then of ref.
+ lea rcx, [rax+rax*2] ; rcx = 3 * source_stride
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax*2]
+ prefetcht0 [rsi+rcx]
+ lea rbx, [rsi+rax*4] ; rbx = src row 4
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax*2]
+ prefetcht0 [rbx+rcx]
+
+ lea rcx, [rdx+rdx*2] ; rcx = 3 * recon_stride
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx*2]
+ prefetcht0 [rdi+rcx]
+ lea rbx, [rdi+rdx*4] ; rbx = ref row 4
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx*2]
+ prefetcht0 [rbx+rcx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16 ; 16 rows
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi] ; 16 src pixels
+ movdqu xmm2, XMMWORD PTR [rdi] ; 16 ref pixels
+
+ prefetcht0 [rsi+rax*8] ; prefetch eight rows ahead
+ prefetcht0 [rdi+rdx*8]
+
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+
+ punpcklbw xmm1, xmm0 ; src low 8 pixels -> words
+ punpckhbw xmm3, xmm0 ; src high 8 pixels -> words
+
+ punpcklbw xmm2, xmm0 ; ref low 8 pixels -> words
+ punpckhbw xmm4, xmm0 ; ref high 8 pixels -> words
+
+
+ psubw xmm1, xmm2 ; src - ref
+ psubw xmm3, xmm4
+
+ paddw xmm7, xmm1 ; accumulate differences
+ pmaddwd xmm1, xmm1 ; square and add adjacent pairs
+
+ paddw xmm7, xmm3
+ pmaddwd xmm3, xmm3
+
+ paddd xmm6, xmm1 ; accumulate squares
+ paddd xmm6, xmm3
+
+ add rsi, rax
+ add rdi, rdx
+
+ sub rcx, 1
+ jnz .var16loop
+
+
+ ; Reduction. xmm1 takes over the four sums of squares; the eight 16-bit
+ ; sums in xmm7 are placed in the upper halves of dwords and shifted right
+ ; arithmetically by 16 to sign-extend them into xmm6 and xmm5.
+ movdqa xmm1, xmm6
+ pxor xmm6, xmm6
+
+ pxor xmm5, xmm5
+ punpcklwd xmm6, xmm7
+
+ punpckhwd xmm5, xmm7
+ psrad xmm5, 16
+
+ psrad xmm6, 16
+ paddd xmm6, xmm5 ; xmm6 = four 32-bit partial sums of differences
+
+ ; Both accumulators are now four dwords each; fold them to one dword by
+ ; interleaving with zero, adding, and then adding the upper 8 bytes.
+ movdqa xmm2, xmm1
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddd xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddd xmm7, xmm6 ; low dword of xmm7 = sum of differences
+ paddd xmm1, xmm2 ; low dword of xmm1 = sum of squared differences
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movd DWORD PTR [rax], xmm7 ; *Sum
+ movd DWORD PTR [rdi], xmm1 ; *SSE
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+;
+; Compares an 8x8 block of bytes at src_ptr with an 8x8 block at ref_ptr.
+; Writes the sum of (src - ref) to *Sum and the sum of (src - ref)^2 to *SSE.
+; The eight rows are unrolled; rsi/rdi advance by two rows at a time and the
+; rows are addressed as base+stride and base+2*stride.
+; NOTE(review): the prototype declares an unsigned int return value, but the
+; last write to rax is the Sum pointer -- callers presumably ignore the
+; return value; confirm.
+;
+; Register roles:
+;   rsi = src pointer          rax = source_stride
+;   rdi = ref pointer          rdx = recon_stride
+;   xmm0 = zero (unpack helper)
+;   xmm7 = eight 16-bit running sums of differences
+;   xmm1 = four 32-bit running sums of squared differences
+global sym(vp8_get8x8var_sse2) PRIVATE
+sym(vp8_get8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ sub rsp, 16 ; reserved scratch; not referenced in this routine
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ ; Row 0
+ movq xmm1, QWORD PTR [rsi]
+ movq xmm2, QWORD PTR [rdi]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ psubsw xmm1, xmm2 ; src - ref
+ paddw xmm7, xmm1 ; accumulate differences
+
+ pmaddwd xmm1, xmm1 ; xmm1 starts the sum-of-squares accumulator
+
+ ; Row 1
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ ; Row 2
+ movq xmm2, QWORD PTR[rsi + rax * 2]
+ movq xmm3, QWORD PTR[rdi + rdx * 2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ ; Row 3 (bases now point at row 2)
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ ; Row 4
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ ; Row 5 (bases now point at row 4)
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+ ; Row 6
+ movq xmm2, QWORD PTR[rsi + rax *2]
+ movq xmm3, QWORD PTR[rdi + rdx *2]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ ; Row 7 (bases now point at row 6)
+ lea rsi, [rsi + rax * 2]
+ lea rdi, [rdi + rdx * 2]
+
+ movq xmm2, QWORD PTR[rsi + rax]
+ movq xmm3, QWORD PTR[rdi + rdx]
+
+ punpcklbw xmm2, xmm0
+ punpcklbw xmm3, xmm0
+
+ psubsw xmm2, xmm3
+ paddw xmm7, xmm2
+
+ pmaddwd xmm2, xmm2
+ paddd xmm1, xmm2
+
+
+ ; Reduction. The eight 16-bit sums are spread out with zero interleaves and
+ ; combined with paddw, i.e. with wrap-around 16-bit adds, so the low word
+ ; ends up holding the total modulo 2^16. It is sign-extended at the end by
+ ; `movsx rcx, dx`. The sums of squares in xmm1 are folded with paddd in the
+ ; same interleaved sequence.
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+
+ punpckhwd xmm7, xmm0
+ movdqa xmm2, xmm1
+
+ paddw xmm6, xmm7
+ punpckldq xmm1, xmm0
+
+ punpckhdq xmm2, xmm0
+ movdqa xmm7, xmm6
+
+ paddd xmm1, xmm2
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm7, xmm0
+ paddw xmm6, xmm7
+
+ movdqa xmm2, xmm1
+ movdqa xmm7, xmm6
+
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+
+ paddw xmm7, xmm6 ; low word of xmm7 = 16-bit sum of differences
+ paddd xmm1, xmm2 ; low dword of xmm1 = sum of squared differences
+
+ mov rax, arg(5) ;[Sum]
+ mov rdi, arg(4) ;[SSE]
+
+ movq rdx, xmm7
+ movsx rcx, dx ; sign-extend the 16-bit sum
+
+ mov dword ptr [rax], ecx ; *Sum
+ movd DWORD PTR [rdi], xmm1 ; *SSE
+
+ ; begin epilog
+ add rsp, 16
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared;;
+;
+;)
+;
+; Filters an 8-pixel-wide region at ref_ptr with two-tap filters selected by
+; xoffset (horizontal) and yoffset (vertical), subtracts Height rows of eight
+; src pixels, and writes the sum of the differences to *sum and the sum of
+; the squared differences to *sumsquared.
+;
+; Filter selection: offset << 5 (32 bytes per entry) indexes
+; vp8_bilinear_filters_sse2; the first tap is read at +0 and the second at
+; +16. The rounding vector is loaded once from xmm_bi_rd into xmm4.
+;
+; Four code paths, chosen by which offsets are zero:
+;   xoffset != 0, yoffset != 0 : horizontal then vertical pass (main loop)
+;   xoffset == 0, yoffset != 0 : vertical pass only   (..._sp_only)
+;   xoffset != 0, yoffset == 0 : horizontal pass only (..._fp_only)
+;   xoffset == 0, yoffset == 0 : plain difference     (..._full_pixel)
+; The paths that run a vertical pass read Height+1 ref rows.
+;
+; Common register roles:
+;   rsi = ref row pointer      rdi = src row pointer      rcx = rows left
+;   xmm0 = zero (unpack helper)
+;   xmm6 = eight 16-bit running sums of differences
+;   xmm7 = four 32-bit running sums of squared differences
+; rbx is reused as a stride register after the table addresses have been
+; formed; it is saved by the extra `push rbx` and restored before RESTORE_GOT.
+; The final reduction runs in MMX registers and no emms is executed here.
+global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
+sym(vp8_filter_block2d_bil_var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ pxor xmm6, xmm6 ; sum-of-differences accumulator
+ pxor xmm7, xmm7 ; sum-of-squares accumulator
+
+ lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
+ movdqa xmm4, XMMWORD PTR [rsi]
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je filter_block2d_bil_var_sse2_sp_only
+
+ shl rax, 5 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je filter_block2d_bil_var_sse2_fp_only
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ ; Horizontal pass on the first ref row.
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ; pixels 0..7
+ movq xmm3, QWORD PTR [rsi+1] ; pixels 1..8
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ; * first horizontal tap
+ punpcklbw xmm3, xmm0
+ pmullw xmm3, [rax+16] ; * second horizontal tap
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ; rounding
+ psraw xmm1, xmm_filter_shift ;
+ movdqa xmm5, xmm1 ; xmm5 = previous filtered row
+
+ movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
+ lea rsi, [rsi + rbx]
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+ ; Horizontal pass on the current ref row.
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movdqa xmm3, xmm5 ; xmm3 = previous filtered row
+ movdqa xmm5, xmm1 ; current row becomes the next "previous row"
+
+ ; Vertical pass between the previous and the current filtered rows.
+ pmullw xmm3, [rdx] ; * first vertical tap
+ pmullw xmm1, [rdx+16] ; * second vertical tap
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ; eight src pixels
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ; filtered ref - src
+ paddw xmm6, xmm1 ; accumulate differences
+
+ pmaddwd xmm1, xmm1 ; square and add adjacent pairs
+ paddd xmm7, xmm1 ; accumulate squares
+
+ lea rsi, [rsi + rbx] ;ref_pixels_per_line
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_var_sse2_loop ;
+
+ jmp filter_block2d_bil_variance
+
+ ; xoffset == 0: vertical pass only (rcx still holds the filter table base).
+filter_block2d_bil_var_sse2_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
+ shl rdx, 5
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movq xmm1, QWORD PTR [rsi] ; first ref row, unfiltered
+ punpcklbw xmm1, xmm0 ;
+
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+ movq xmm3, QWORD PTR [rsi] ; current ref row
+ punpcklbw xmm3, xmm0 ;
+ movdqa xmm5, xmm3 ; saved to become the next "previous row"
+
+ pmullw xmm1, [rdx] ; previous row * first vertical tap
+ pmullw xmm3, [rdx+16] ; current row * second vertical tap
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ movdqa xmm1, xmm5 ;
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_sp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+ ; Both offsets zero: no filtering, plain ref - src.
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
+ ; yoffset == 0: horizontal pass only. rax still holds HFilter; rdx is
+ ; reused for the ref stride.
+filter_block2d_bil_var_sse2_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ movq xmm3, QWORD PTR [rsi+1] ;
+
+ punpcklbw xmm1, xmm0 ;
+ pmullw xmm1, [rax] ;
+ punpcklbw xmm3, xmm0 ;
+ pmullw xmm3, [rax+16] ;
+
+ paddw xmm1, xmm3 ;
+ paddw xmm1, xmm4 ;
+ psraw xmm1, xmm_filter_shift ;
+
+ movq xmm3, QWORD PTR [rdi] ;
+ punpcklbw xmm3, xmm0 ;
+
+ psubw xmm1, xmm3 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+ lea rsi, [rsi + rdx]
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_fp_only_loop ;
+
+ jmp filter_block2d_bil_variance
+
+ ; Shared reduction: fold the upper 8 bytes of both accumulators onto the
+ ; lower 8 bytes, then finish in MMX registers.
+filter_block2d_bil_variance:
+ movdq2q mm6, xmm6 ; low four 16-bit sums
+ movdq2q mm7, xmm7 ; low two 32-bit sums of squares
+
+ psrldq xmm6, 8
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6 ; high four 16-bit sums
+ movdq2q mm3, xmm7 ; high two 32-bit sums of squares
+
+ paddw mm6, mm2
+ paddd mm7, mm3
+
+ ; Place each 16-bit sum in the upper half of a dword, add the four dwords,
+ ; then shift right arithmetically by 16 to sign-extend the total.
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ;
+ punpckhwd mm3, mm6 ;
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ;
+
+ psrad mm2, 16 ; low dword of mm2 = signed sum of differences
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ; low dword of mm4 = sum of squared differences
+
+ mov rsi, arg(7) ; sum
+ mov rdi, arg(8) ; sumsquared
+
+ movd [rsi], mm2 ; xsum
+ movd [rdi], mm4 ; xxsum
+
+ ; begin epilog
+ pop rbx
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 8 pixels per row: ref is pavgb-averaged with its right and lower neighbours, then compared with src
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; xmm6 = word accumulators for the differences
+ pxor xmm7, xmm7 ; xmm7 = dword accumulators for the squared differences
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height = number of loop iterations
+ ; the ref stride is read from r8 (64-bit) or arg(1) (32-bit), so rax is not loaded here
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = first ref row, bytes 0..7
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = first ref row, bytes 1..8
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average of the first row
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next ref row
+%else
+ add rsi, r8 ; next ref row
+%endif
+
+vp8_half_horiz_vert_variance8x_h_1:
+ ; on entry xmm5 holds the horizontal average of the previous ref row
+ movq xmm1, QWORD PTR [rsi] ; xmm1 = current ref row, bytes 0..7
+ movq xmm2, QWORD PTR [rsi+1] ; xmm2 = current ref row, bytes 1..8
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2): horizontal average of the current row
+
+ pavgb xmm5, xmm1 ; xmm5 = average of the previous and current row averages
+ punpcklbw xmm5, xmm0 ; widen the 8 result bytes to words
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src bytes 0..7
+ punpcklbw xmm3, xmm0 ; widen to words
+
+ psubw xmm5, xmm3 ; xmm5 = averaged ref - src
+ paddw xmm6, xmm5 ; accumulate the differences
+ pmaddwd xmm5, xmm5 ; square the differences and add adjacent pairs
+ paddd xmm7, xmm5 ; accumulate the squared differences
+
+ movdqa xmm5, xmm1 ; keep this row's horizontal average for the next iteration
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next ref row
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next src row
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ; mm6 = low 64 bits of the difference accumulators
+ movdq2q mm7, xmm7 ; mm7 = low 64 bits of the squared accumulators
+
+ psrldq xmm6, 8 ; move the high 64 bits down
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6 ; mm2 = high 64 bits of the difference accumulators
+ movdq2q mm3, xmm7 ; mm3 = high 64 bits of the squared accumulators
+
+ paddw mm6, mm2 ; mm6 = four word partial sums
+ paddd mm7, mm3 ; mm7 = two dword partial sums of squares
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ; words 0,1 land in the upper half of each dword
+ punpckhwd mm3, mm6 ; words 2,3 likewise
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ; low dword = (sum of the four words) << 16
+
+ psrad mm2, 16 ; arithmetic shift: the 16-bit total, sign-extended
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ; low dword = total of the squared differences
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ; *sum
+ movd [rdi], mm4 ; *sumsquared
+
+ ; NOTE: MMX registers were used above and no emms is issued in this routine
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 16 pixels per row: ref is pavgb-averaged with its right and lower neighbours, then compared with src
+ pxor xmm6, xmm6 ; xmm6 = word accumulators for the differences
+ pxor xmm7, xmm7 ; xmm7 = dword accumulators for the squared differences
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height = number of loop iterations
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = first ref row, bytes 0..15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = first ref row, bytes 1..16
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average of the first row
+
+ lea rsi, [rsi + rax] ; next ref row
+
+vp8_half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ; xmm1 = current ref row, bytes 0..15
+ movdqu xmm2, XMMWORD PTR [rsi+1] ; xmm2 = current ref row, bytes 1..16
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2): horizontal average of the current row
+
+ pavgb xmm5, xmm1 ; xmm5 = average of the previous and current row averages
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = low 8 result bytes as words
+ punpckhbw xmm4, xmm0 ; xmm4 = high 8 result bytes as words
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src bytes 0..7
+ punpcklbw xmm3, xmm0 ; widen to words
+ psubw xmm5, xmm3 ; xmm5 = differences for columns 0..7
+
+ movq xmm3, QWORD PTR [rdi+8] ; xmm3 = src bytes 8..15
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3 ; xmm4 = differences for columns 8..15
+
+ paddw xmm6, xmm5 ; accumulate the differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; square the differences and add adjacent pairs
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; accumulate the squared differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; keep this row's horizontal average for the next iteration
+
+ lea rsi, [rsi + rax] ; next ref row
+ lea rdi, [rdi + rdx] ; next src row
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5 ; xmm5 = 0 for the dword interleaves below
+
+ punpcklwd xmm0, xmm6 ; words 0..3 of xmm6 into the upper half of each dword
+ punpckhwd xmm1, xmm6 ; words 4..7 likewise
+ psrad xmm0, 16 ; arithmetic shift: sign-extend the words to dwords
+ psrad xmm1, 16
+ paddd xmm0, xmm1 ; xmm0 = four dword partial sums of the differences
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5 ; spread squared sums 0,1 over the two qwords
+ punpckhdq xmm7, xmm5 ; spread squared sums 2,3 over the two qwords
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5 ; same folding for the difference sums
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8 ; bring the upper qword down
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7 ; low dword of xmm6 = total of the squared differences
+ paddd xmm0, xmm1 ; low dword of xmm0 = total of the differences
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0 ; *sum
+ movd [rdi], xmm6 ; *sumsquared
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
+sym(vp8_half_vert_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 8 pixels per row: each ref row is pavgb-averaged with the row below it, then compared with src
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; xmm6 = word accumulators for the differences
+ pxor xmm7, xmm7 ; xmm7 = dword accumulators for the squared differences
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height = number of loop iterations
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line, used as the offset to the row below
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+vp8_half_vert_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = current ref row, bytes 0..7
+ movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = next ref row, bytes 0..7
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): vertical average
+ punpcklbw xmm5, xmm0 ; widen the 8 result bytes to words
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src bytes 0..7
+ punpcklbw xmm3, xmm0 ; widen to words
+
+ psubw xmm5, xmm3 ; xmm5 = averaged ref - src
+ paddw xmm6, xmm5 ; accumulate the differences
+ pmaddwd xmm5, xmm5 ; square the differences and add adjacent pairs
+ paddd xmm7, xmm5 ; accumulate the squared differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next ref row
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next src row
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+
+ sub rcx, 1 ;
+ jnz vp8_half_vert_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ; mm6 = low 64 bits of the difference accumulators
+ movdq2q mm7, xmm7 ; mm7 = low 64 bits of the squared accumulators
+
+ psrldq xmm6, 8 ; move the high 64 bits down
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6 ; mm2 = high 64 bits of the difference accumulators
+ movdq2q mm3, xmm7 ; mm3 = high 64 bits of the squared accumulators
+
+ paddw mm6, mm2 ; mm6 = four word partial sums
+ paddd mm7, mm3 ; mm7 = two dword partial sums of squares
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ; words 0,1 land in the upper half of each dword
+ punpckhwd mm3, mm6 ; words 2,3 likewise
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ; low dword = (sum of the four words) << 16
+
+ psrad mm2, 16 ; arithmetic shift: the 16-bit total, sign-extended
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ; low dword = total of the squared differences
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ; *sum
+ movd [rdi], mm4 ; *sumsquared
+
+ ; NOTE: MMX registers were used above and no emms is issued in this routine
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 16 pixels per row: each ref row is pavgb-averaged with the row below it, then compared with src
+ pxor xmm6, xmm6 ; xmm6 = word accumulators for the differences
+ pxor xmm7, xmm7 ; xmm7 = dword accumulators for the squared differences
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = first ref row, bytes 0..15
+ lea rsi, [rsi + rax ] ; next ref row
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+
+vp8_half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = current ref row; xmm5 holds the row above
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): vertical average
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = low 8 result bytes as words
+ punpckhbw xmm4, xmm0 ; xmm4 = high 8 result bytes as words
+
+ movq xmm2, QWORD PTR [rdi] ; xmm2 = src bytes 0..7
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2 ; differences for columns 0..7
+ movq xmm2, QWORD PTR [rdi+8] ; xmm2 = src bytes 8..15
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2 ; differences for columns 8..15
+
+ paddw xmm6, xmm5 ; accumulate the differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; square the differences and add adjacent pairs
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; accumulate the squared differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3 ; the current row becomes the upper row of the next iteration
+
+ lea rsi, [rsi + rax] ; next ref row
+ lea rdi, [rdi + rdx] ; next src row
+
+ sub rcx, 1
+ jnz vp8_half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5 ; xmm5 = 0 for the dword interleaves below
+
+ punpcklwd xmm0, xmm6 ; words 0..3 of xmm6 into the upper half of each dword
+ punpckhwd xmm1, xmm6 ; words 4..7 likewise
+ psrad xmm0, 16 ; arithmetic shift: sign-extend the words to dwords
+ psrad xmm1, 16
+ paddd xmm0, xmm1 ; xmm0 = four dword partial sums of the differences
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5 ; spread squared sums 0,1 over the two qwords
+ punpckhdq xmm7, xmm5 ; spread squared sums 2,3 over the two qwords
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5 ; same folding for the difference sums
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8 ; bring the upper qword down
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7 ; low dword of xmm6 = total of the squared differences
+ paddd xmm0, xmm1 ; low dword of xmm0 = total of the differences
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0 ; *sum
+ movd [rdi], xmm6 ; *sumsquared
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
+sym(vp8_half_horiz_variance8x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 8 pixels per row: each ref byte is pavgb-averaged with its right neighbour, then compared with src
+%if ABI_IS_32BIT=0
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ pxor xmm6, xmm6 ; xmm6 = word accumulators for the differences
+ pxor xmm7, xmm7 ; xmm7 = dword accumulators for the squared differences
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height = number of loop iterations
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+vp8_half_horiz_variance8x_h_1:
+ movq xmm5, QWORD PTR [rsi] ; xmm5 = ref row, bytes 0..7
+ movq xmm3, QWORD PTR [rsi+1] ; xmm3 = ref row, bytes 1..8
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average
+ punpcklbw xmm5, xmm0 ; widen the 8 result bytes to words
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src bytes 0..7
+ punpcklbw xmm3, xmm0 ; widen to words
+
+ psubw xmm5, xmm3 ; xmm5 = averaged ref - src
+ paddw xmm6, xmm5 ; accumulate the differences
+ pmaddwd xmm5, xmm5 ; square the differences and add adjacent pairs
+ paddd xmm7, xmm5 ; accumulate the squared differences
+
+%if ABI_IS_32BIT
+ add esi, dword ptr arg(1) ;ref_pixels_per_line ; next ref row
+ add edi, dword ptr arg(3) ;src_pixels_per_line ; next src row
+%else
+ add rsi, r8
+ add rdi, r9
+%endif
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance8x_h_1 ;
+
+ movdq2q mm6, xmm6 ; mm6 = low 64 bits of the difference accumulators
+ movdq2q mm7, xmm7 ; mm7 = low 64 bits of the squared accumulators
+
+ psrldq xmm6, 8 ; move the high 64 bits down
+ psrldq xmm7, 8
+
+ movdq2q mm2, xmm6 ; mm2 = high 64 bits of the difference accumulators
+ movdq2q mm3, xmm7 ; mm3 = high 64 bits of the squared accumulators
+
+ paddw mm6, mm2 ; mm6 = four word partial sums
+ paddd mm7, mm3 ; mm7 = two dword partial sums of squares
+
+ pxor mm3, mm3 ;
+ pxor mm2, mm2 ;
+
+ punpcklwd mm2, mm6 ; words 0,1 land in the upper half of each dword
+ punpckhwd mm3, mm6 ; words 2,3 likewise
+
+ paddd mm2, mm3 ;
+ movq mm6, mm2 ;
+
+ psrlq mm6, 32 ;
+ paddd mm2, mm6 ; low dword = (sum of the four words) << 16
+
+ psrad mm2, 16 ; arithmetic shift: the 16-bit total, sign-extended
+ movq mm4, mm7 ;
+
+ psrlq mm4, 32 ;
+ paddd mm4, mm7 ; low dword = total of the squared differences
+
+ mov rsi, arg(5) ; sum
+ mov rdi, arg(6) ; sumsquared
+
+ movd [rsi], mm2 ; *sum
+ movd [rdi], mm4 ; *sumsquared
+
+ ; NOTE: MMX registers were used above and no emms is issued in this routine
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ; 16 pixels per row: each ref byte is pavgb-averaged with its right neighbour, then compared with src
+ pxor xmm6, xmm6 ; xmm6 = word accumulators for the differences
+ pxor xmm7, xmm7 ; xmm7 = dword accumulators for the squared differences
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height = number of loop iterations
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ; xmm0 = 0, used to widen bytes to words
+
+vp8_half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = ref row, bytes 0..15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = ref row, bytes 1..16
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = low 8 result bytes as words
+ punpckhbw xmm1, xmm0 ; xmm1 = high 8 result bytes as words
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = src bytes 0..7
+ punpcklbw xmm3, xmm0 ; widen to words
+ movq xmm2, QWORD PTR [rdi+8] ; xmm2 = src bytes 8..15
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; differences for columns 0..7
+ psubw xmm1, xmm2 ; differences for columns 8..15
+ paddw xmm6, xmm5 ; accumulate the differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; square the differences and add adjacent pairs
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; accumulate the squared differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax] ; next ref row
+ lea rdi, [rdi + rdx] ; next src row
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5 ; xmm5 = 0 for the dword interleaves below
+
+ punpcklwd xmm0, xmm6 ; words 0..3 of xmm6 into the upper half of each dword
+ punpckhwd xmm1, xmm6 ; words 4..7 likewise
+ psrad xmm0, 16 ; arithmetic shift: sign-extend the words to dwords
+ psrad xmm1, 16
+ paddd xmm0, xmm1 ; xmm0 = four dword partial sums of the differences
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5 ; spread squared sums 0,1 over the two qwords
+ punpckhdq xmm7, xmm5 ; spread squared sums 2,3 over the two qwords
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5 ; same folding for the difference sums
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8 ; bring the upper qword down
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7 ; low dword of xmm6 = total of the squared differences
+ paddd xmm0, xmm1 ; low dword of xmm0 = total of the differences
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0 ; *sum
+ movd [rdi], xmm6 ; *sumsquared
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_sse2:
+ dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
+ dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+ dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+ dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+ dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+ dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+ dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+ dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/libvpx/vp8/common/x86/variance_impl_ssse3.asm b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
new file mode 100644
index 0000000..686b4a9
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_impl_ssse3.asm
@@ -0,0 +1,364 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int xoffset,
+; int yoffset,
+; int *sum,
+; unsigned int *sumsquared
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second operand
+;of pmaddubsw is treated as signed bytes, a zero offset must be handled separately.
+global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
+sym(vp8_filter_block2d_bil_var_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
+ movsxd rax, dword ptr arg(5) ; xoffset
+
+ cmp rax, 0 ; skip first_pass filter if xoffset=0
+ je .filter_block2d_bil_var_ssse3_sp_only
+
+ shl rax, 4 ; point to filter coeff with xoffset
+ lea rax, [rax + rcx] ; HFilter
+
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip second_pass filter if yoffset=0
+ je .filter_block2d_bil_var_ssse3_fp_only
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi+1]
+ movdqa xmm2, xmm0
+
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, [rax]
+ pmaddubsw xmm2, [rax]
+
+ paddw xmm0, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm0, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ packuswb xmm0, xmm2
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+%else
+ movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+ lea rsi, [rsi + r8]
+%endif
+
+.filter_block2d_bil_var_ssse3_loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rsi+1]
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2
+ punpckhbw xmm3, xmm2
+ pmaddubsw xmm1, [rax]
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+ packuswb xmm1, xmm3
+
+ movdqa xmm2, xmm0
+ movdqa xmm0, xmm1
+ movdqa xmm3, xmm2
+
+ punpcklbw xmm2, xmm1
+ punpckhbw xmm3, xmm1
+ pmaddubsw xmm2, [rdx]
+ pmaddubsw xmm3, [rdx]
+
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm2, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm1, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm1, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm2, xmm1
+ psubw xmm3, xmm5
+ paddw xmm6, xmm2
+ paddw xmm6, xmm3
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm2
+ paddd xmm7, xmm3
+
+%if ABI_IS_32BIT
+ add rsi, dword ptr arg(1) ;ref_pixels_per_line
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rsi, [rsi + r8]
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_var_ssse3_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_sp_only:
+ movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; Both xoffset =0 and yoffset=0
+ je .filter_block2d_bil_var_ssse3_full_pixel
+
+ shl rdx, 4
+ lea rdx, [rdx + rcx] ; VFilter
+
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqa xmm0, xmm1
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+ lea rsi, [rsi + rax]
+
+.filter_block2d_bil_sp_only_loop:
+ movdqu xmm3, XMMWORD PTR [rsi]
+ movdqa xmm2, xmm1
+ movdqa xmm0, xmm3
+
+ punpcklbw xmm1, xmm3
+ punpckhbw xmm2, xmm3
+ pmaddubsw xmm1, [rdx]
+ pmaddubsw xmm2, [rdx]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)]
+ paddw xmm2, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm2, xmm_filter_shift
+
+ movq xmm3, QWORD PTR [rdi]
+ pxor xmm4, xmm4
+ punpcklbw xmm3, xmm4
+ movq xmm5, QWORD PTR [rdi+8]
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ movdqa xmm1, xmm0
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_sp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0
+
+.filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi]
+ punpcklbw xmm1, xmm0
+ movq xmm2, QWORD PTR [rsi+8]
+ punpcklbw xmm2, xmm0
+
+ movq xmm3, QWORD PTR [rdi]
+ punpcklbw xmm3, xmm0
+ movq xmm4, QWORD PTR [rdi+8]
+ punpcklbw xmm4, xmm0
+
+ psubw xmm1, xmm3
+ psubw xmm2, xmm4
+ paddw xmm6, xmm1
+ paddw xmm6, xmm2
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm7, xmm1
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rdx] ;src_pixels_per_line
+ sub rcx, 1
+ jnz .filter_block2d_bil_full_pixel_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_var_ssse3_fp_only:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
+ ; reached when yoffset == 0: only the horizontal filter at [rax] is applied
+ pxor xmm0, xmm0
+
+%if ABI_IS_32BIT=0
+ movsxd r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+.filter_block2d_bil_fp_only_loop:
+ movdqu xmm1, XMMWORD PTR [rsi] ; ref row, bytes 0..15
+ movdqu xmm2, XMMWORD PTR [rsi+1] ; ref row, bytes 1..16
+ movdqa xmm3, xmm1
+
+ punpcklbw xmm1, xmm2 ; interleave each byte with its right neighbour (low half)
+ punpckhbw xmm3, xmm2 ; same for the high half
+ pmaddubsw xmm1, [rax] ; multiply the byte pairs by the filter taps and add
+ pmaddubsw xmm3, [rax]
+
+ paddw xmm1, [GLOBAL(xmm_bi_rd)] ; add the rounding constant
+ paddw xmm3, [GLOBAL(xmm_bi_rd)]
+ psraw xmm1, xmm_filter_shift
+ psraw xmm3, xmm_filter_shift
+
+ movq xmm2, QWORD PTR [rdi] ; src bytes 0..7 (movq loads 64 bits)
+ pxor xmm4, xmm4
+ punpcklbw xmm2, xmm4 ; widen to words
+ movq xmm5, QWORD PTR [rdi+8] ; src bytes 8..15
+ punpcklbw xmm5, xmm4
+
+ psubw xmm1, xmm2 ; filtered ref - src
+ psubw xmm3, xmm5
+ paddw xmm6, xmm1 ; accumulate the differences
+ paddw xmm6, xmm3
+ pmaddwd xmm1, xmm1 ; square the differences and add adjacent pairs
+ pmaddwd xmm3, xmm3
+ paddd xmm7, xmm1 ; accumulate the squared differences
+ paddd xmm7, xmm3
+
+ lea rsi, [rsi + rdx] ; next ref row
+%if ABI_IS_32BIT
+ add rdi, dword ptr arg(3) ;src_pixels_per_line
+%else
+ lea rdi, [rdi + r9]
+%endif
+
+ sub rcx, 1
+ jnz .filter_block2d_bil_fp_only_loop
+
+ jmp .filter_block2d_bil_variance
+
+.filter_block2d_bil_variance:
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(7) ;[Sum]
+ mov rdi, arg(8) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
diff --git a/libvpx/vp8/common/x86/variance_mmx.c b/libvpx/vp8/common/x86/variance_mmx.c
new file mode 100644
index 0000000..0c4dd4a
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_mmx.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
+
+extern void filter_block1d_h6_mmx
+(
+ const unsigned char *src_ptr,
+ unsigned short *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *filter
+);
+extern void filter_block1d_v6_mmx
+(
+ const short *src_ptr,
+ unsigned char *output_ptr,
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ short *filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ const short *HFilter,
+ const short *VFilter,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+
+/* 4x4 block: stores the kernel's SSE in *sse, returns SSE - ((Sum * Sum) >> 4). */
+unsigned int vp8_variance4x4_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sq_sum; /* SSE output of vp8_get4x4var_mmx */
+    int diff_sum;        /* Sum output of vp8_get4x4var_mmx */
+
+    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sq_sum, &diff_sum);
+    *sse = sq_sum;
+    return sq_sum - ((unsigned int)(diff_sum * diff_sum) >> 4);
+}
+
+/* 8x8 block: stores the kernel's SSE in *sse and
+ * returns SSE - ((Sum * Sum) >> 6). */
+unsigned int vp8_variance8x8_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sq_sum; /* SSE output of vp8_get8x8var_mmx */
+    int diff_sum;        /* Sum output of vp8_get8x8var_mmx */
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sq_sum, &diff_sum);
+    *sse = sq_sum;
+    return sq_sum - ((unsigned int)(diff_sum * diff_sum) >> 6);
+}
+
+/* 16x16 block: adds the SSE of its four 8x8 quadrants, stores and returns the total. */
+unsigned int vp8_mse16x16_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    const unsigned char *src_lower = src_ptr + 8 * source_stride; /* rows 8..15 */
+    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
+    unsigned int quad_sse[4], total;
+    int quad_sum[4]; /* written by the kernel, not used here */
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &quad_sse[0], &quad_sum[0]);
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &quad_sse[1], &quad_sum[1]);
+    vp8_get8x8var_mmx(src_lower, source_stride, ref_lower, recon_stride, &quad_sse[2], &quad_sum[2]);
+    vp8_get8x8var_mmx(src_lower + 8, source_stride, ref_lower + 8, recon_stride, &quad_sse[3], &quad_sum[3]);
+    total = quad_sse[0] + quad_sse[1] + quad_sse[2] + quad_sse[3];
+    *sse = total;
+    return total;
+}
+
+
+/* 16x16 block from four 8x8 kernels: *sse = summed SSE, returns SSE - ((Sum * Sum) >> 8). */
+unsigned int vp8_variance16x16_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+    *sse = var;
+    /* With 256 byte differences |avg| may reach 255 * 256; its square overflows int, so multiply as unsigned. */
+    return (var - (((unsigned int)avg * (unsigned int)avg) >> 8));
+}
+
+/* 16x8 block from two side-by-side 8x8 kernels; returns SSE - ((Sum * Sum) >> 7). */
+unsigned int vp8_variance16x8_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse_left, sse_right, sse_total;
+    int sum_left, sum_right, sum_total;
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse_left, &sum_left);
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse_right, &sum_right);
+
+    sse_total = sse_left + sse_right;
+    sum_total = sum_left + sum_right;
+    *sse = sse_total;
+    return sse_total - ((unsigned int)(sum_total * sum_total) >> 7);
+}
+
+
+/* 8x16 block from two stacked 8x8 kernels (rows 0..7 and 8..15);
+ * stores the summed SSE in *sse and returns SSE - ((Sum * Sum) >> 7). */
+unsigned int vp8_variance8x16_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse_top, sse_bottom, sse_total;
+    int sum_top, sum_bottom, sum_total;
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse_top, &sum_top);
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse_bottom, &sum_bottom);
+
+    sse_total = sse_top + sse_bottom;
+    sum_total = sum_top + sum_bottom;
+    *sse = sse_total;
+    return sse_total - ((unsigned int)(sum_total * sum_total) >> 7);
+}
+
+
+/* 4x4 sub-pixel variance: filter rows are picked by xoffset/yoffset; returns sumsq - ((sum * sum) >> 4). */
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse)
+{
+    const short *h_taps = vp8_bilinear_filters_x86_4[xoffset];
+    const short *v_taps = vp8_bilinear_filters_x86_4[yoffset];
+    int diff_sum;
+    unsigned int sq_sum;
+
+    vp8_filter_block2d_bil4x4_var_mmx(src_ptr, src_pixels_per_line,
+                                      dst_ptr, dst_pixels_per_line,
+                                      h_taps, v_taps, &diff_sum, &sq_sum);
+    *sse = sq_sum;
+    return sq_sum - ((unsigned int)(diff_sum * diff_sum) >> 4);
+}
+
+
+/* 8x8 sub-pixel variance: one kernel call with Height 8; returns sumsq - ((sum * sum) >> 6). */
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    const short *h_taps = vp8_bilinear_filters_x86_4[xoffset];
+    const short *v_taps = vp8_bilinear_filters_x86_4[yoffset];
+    int diff_sum;
+    unsigned int sq_sum;
+
+    vp8_filter_block2d_bil_var_mmx(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pixels_per_line, 8,
+                                   h_taps, v_taps, &diff_sum, &sq_sum);
+    *sse = sq_sum;
+    return sq_sum - ((unsigned int)(diff_sum * diff_sum) >> 6);
+}
+
+/* 16x16 sub-pixel variance, built from two vp8_filter_block2d_bil_var_mmx
+ * calls at column offsets 0 and 8, each with Height 16.  Stores the combined
+ * sum of squares in *sse and returns it minus ((sum * sum) >> 8). */
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    const short *h_taps = vp8_bilinear_filters_x86_4[xoffset];
+    const short *v_taps = vp8_bilinear_filters_x86_4[yoffset];
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    /* columns starting at offset 0 */
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 16,
+        h_taps, v_taps, &xsum0, &xxsum0);
+
+    /* columns starting at offset 8 */
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + 8, src_pixels_per_line,
+        dst_ptr + 8, dst_pixels_per_line, 16,
+        h_taps, v_taps, &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    /* With 256 byte differences |xsum0| may reach 255 * 256; its square
+     * overflows int, so the multiplication is done in unsigned arithmetic. */
+    return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8));
+}
+
+/* Runs the 16x16 sub-pixel variance only for the value it stores in *sse, and returns that value. */
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse)
+{
+    (void)vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+    return *sse;
+}
+
+/* 16x8 sub-pixel variance from two kernel calls at column offsets 0 and 8,
+ * each with Height 8; returns sumsq - ((sum * sum) >> 7) and stores sumsq in *sse. */
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    const short *h_taps = vp8_bilinear_filters_x86_4[xoffset];
+    const short *v_taps = vp8_bilinear_filters_x86_4[yoffset];
+    int sum_left, sum_right;
+    int sum_total;
+    unsigned int sq_left, sq_right;
+    unsigned int sq_total;
+
+    /* columns starting at offset 0 */
+    vp8_filter_block2d_bil_var_mmx(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pixels_per_line, 8,
+                                   h_taps, v_taps, &sum_left, &sq_left);
+
+    /* columns starting at offset 8 */
+    vp8_filter_block2d_bil_var_mmx(src_ptr + 8, src_pixels_per_line,
+                                   dst_ptr + 8, dst_pixels_per_line, 8,
+                                   h_taps, v_taps, &sum_right, &sq_right);
+
+    sum_total = sum_left + sum_right;
+    sq_total = sq_left + sq_right;
+
+    *sse = sq_total;
+    return sq_total - ((unsigned int)(sum_total * sum_total) >> 7);
+}
+
+/* 8x16 sub-pixel variance: one kernel call with Height 16; returns sumsq - ((sum * sum) >> 7). */
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    const short *h_taps = vp8_bilinear_filters_x86_4[xoffset];
+    const short *v_taps = vp8_bilinear_filters_x86_4[yoffset];
+    int diff_sum;
+    unsigned int sq_sum;
+    vp8_filter_block2d_bil_var_mmx(src_ptr, src_pixels_per_line,
+                                   dst_ptr, dst_pixels_per_line, 16,
+                                   h_taps, v_taps, &diff_sum, &sq_sum);
+    *sse = sq_sum;
+    return sq_sum - ((unsigned int)(diff_sum * diff_sum) >> 7);
+}
+
+
+/* 16x16 sub-pixel variance with xoffset = 4, yoffset = 0 (presumably the horizontal half-pixel position). */
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, ref_ptr, recon_stride, sse);
+}
+
+
+/* 16x16 sub-pixel variance with xoffset = 0, yoffset = 4 (presumably the vertical half-pixel position). */
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, ref_ptr, recon_stride, sse);
+}
+
+
+/* 16x16 sub-pixel variance with xoffset = 4, yoffset = 4 (presumably the diagonal half-pixel position). */
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, ref_ptr, recon_stride, sse);
+}
diff --git a/libvpx/vp8/common/x86/variance_sse2.c b/libvpx/vp8/common/x86/variance_sse2.c
new file mode 100644
index 0000000..afd6429
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_sse2.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
+
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); /* prototype only; nothing in this file calls it */
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); /* prototype only; nothing in this file calls it */
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); /* prototype only; nothing in this file calls it */
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); /* prototype only; nothing in this file calls it */
+
+extern void vp8_filter_block2d_bil4x4_var_mmx /* defined outside this file; called by vp8_sub_pixel_variance4x4_wmt */
+(
+ const unsigned char *ref_ptr, /* the caller in this file passes its src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* the caller in this file passes its dst_ptr here */
+ int src_pixels_per_line,
+ const short *HFilter, /* caller passes vp8_bilinear_filters_x86_4[xoffset] */
+ const short *VFilter, /* caller passes vp8_bilinear_filters_x86_4[yoffset] */
+ int *sum, /* out: read by the caller after the call */
+ unsigned int *sumsquared /* out: read by the caller after the call */
+);
+
+extern unsigned int vp8_get4x4var_mmx /* defined outside this file; called by vp8_variance4x4_wmt, which ignores the return value */
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE, /* out: read by the caller after the call */
+ int *Sum /* out: read by the caller after the call */
+);
+
+unsigned int vp8_get_mb_ss_sse2 /* prototype only; nothing in this file calls it */
+(
+ const short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2 /* no body in this file; used by vp8_variance16x16_wmt and vp8_mse16x16_wmt, return value ignored */
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE, /* out: read by the callers after the call */
+ int *Sum /* out: read by vp8_variance16x16_wmt; vp8_mse16x16_wmt leaves it unused */
+);
+unsigned int vp8_get8x8var_sse2 /* no body in this file; used by the 8x8, 16x8 and 8x16 wrappers, return value ignored */
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE, /* out: read by the callers after the call */
+ int *Sum /* out: read by the callers after the call */
+);
+void vp8_filter_block2d_bil_var_sse2 /* no body in this file; the general (non half-sample) path of the *_wmt sub-pixel wrappers */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr (or src_ptr + 8) here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr (or dst_ptr + 8) here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int xoffset, /* passed through from the wrapper's xoffset */
+ int yoffset, /* passed through from the wrapper's yoffset */
+ int *sum, /* out: read by the callers after the call */
+ unsigned int *sumsquared /* out: read by the callers after the call */
+);
+void vp8_half_horiz_vert_variance8x_h_sse2 /* no body in this file; used by the 8-wide wrappers when xoffset == 4 && yoffset == 4 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+void vp8_half_horiz_vert_variance16x_h_sse2 /* no body in this file; used by the 16-wide wrappers when xoffset == 4 && yoffset == 4 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+void vp8_half_horiz_variance8x_h_sse2 /* no body in this file; used by the 8-wide wrappers when xoffset == 4 && yoffset == 0 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+void vp8_half_horiz_variance16x_h_sse2 /* no body in this file; used by the 16-wide wrappers when xoffset == 4 && yoffset == 0 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+void vp8_half_vert_variance8x_h_sse2 /* no body in this file; used by the 8-wide wrappers when xoffset == 0 && yoffset == 4 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+void vp8_half_vert_variance16x_h_sse2 /* no body in this file; used by the 16-wide wrappers when xoffset == 0 && yoffset == 4 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 8 or 16 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+
+unsigned int vp8_variance4x4_wmt( /* 4x4 variance: stores the helper's SSE in *sse and returns SSE - Sum^2/16. */
+ const unsigned char *src_ptr, /* forwarded to vp8_get4x4var_mmx */
+ int source_stride,
+ const unsigned char *ref_ptr, /* forwarded to vp8_get4x4var_mmx */
+ int recon_stride,
+ unsigned int *sse) /* out: receives the helper's SSE */
+{
+ unsigned int var; /* receives the helper's SSE output */
+ int avg; /* receives the helper's Sum output */
+
+ vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 4)); /* >> 4 divides by 16 = 4 * 4 */
+
+}
+
+unsigned int vp8_variance8x8_wmt /* 8x8 variance: stores the helper's SSE in *sse and returns SSE - Sum^2/64. */
+(
+ const unsigned char *src_ptr, /* forwarded to vp8_get8x8var_sse2 */
+ int source_stride,
+ const unsigned char *ref_ptr, /* forwarded to vp8_get8x8var_sse2 */
+ int recon_stride,
+ unsigned int *sse) /* out: receives the helper's SSE */
+{
+ unsigned int var; /* receives the helper's SSE output */
+ int avg; /* receives the helper's Sum output */
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 6)); /* >> 6 divides by 64 = 8 * 8 */
+
+}
+
+
+unsigned int vp8_variance16x16_wmt /* 16x16 variance: stores the helper's SSE in *sse and returns SSE - Sum^2/256. */
+(
+ const unsigned char *src_ptr, /* forwarded to vp8_get16x16var_sse2 */
+ int source_stride,
+ const unsigned char *ref_ptr, /* forwarded to vp8_get16x16var_sse2 */
+ int recon_stride,
+ unsigned int *sse) /* out: receives the helper's SSE */
+{
+ unsigned int sse0; /* receives the helper's SSE output */
+ int sum0; /* receives the helper's Sum output */
+
+ /* sum0 is squared as unsigned: over 256 8-bit differences |sum0| can reach 65280, whose square overflows int but fits in 32 unsigned bits. */
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return (sse0 - (((unsigned int)sum0 * (unsigned int)sum0) >> 8)); /* >> 8 divides by 256 = 16 * 16 */
+}
+unsigned int vp8_mse16x16_wmt( /* 16x16 case that returns the helper's SSE unchanged (no Sum term is subtracted). */
+ const unsigned char *src_ptr, /* forwarded to vp8_get16x16var_sse2 */
+ int source_stride,
+ const unsigned char *ref_ptr, /* forwarded to vp8_get16x16var_sse2 */
+ int recon_stride,
+ unsigned int *sse) /* out: receives the same value that is returned */
+{
+
+ unsigned int sse0; /* receives the helper's SSE output */
+ int sum0; /* receives the helper's Sum output; not used afterwards */
+ vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+ *sse = sse0;
+ return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt /* 16x8 variance built from two 8x8 helper calls placed 8 columns apart. */
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) /* out: receives the combined SSE */
+{
+ unsigned int sse0, sse1, var; /* per-half SSE values and their total */
+ int sum0, sum1, avg; /* per-half Sum values and their total */
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; /* columns 0..7 */
+ vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); /* columns 8..15 */
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7)); /* >> 7 divides by 128 = 16 * 8 */
+
+}
+
+unsigned int vp8_variance8x16_wmt /* 8x16 variance built from two 8x8 helper calls placed 8 rows apart. */
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *sse) /* out: receives the combined SSE */
+{
+ unsigned int sse0, sse1, var; /* per-half SSE values and their total */
+ int sum0, sum1, avg; /* per-half Sum values and their total */
+
+ vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; /* rows 0..7 */
+ vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; /* rows 8..15: each pointer advanced by 8 of its own strides */
+
+ var = sse0 + sse1;
+ avg = sum0 + sum1;
+ *sse = var;
+ return (var - ((unsigned int)(avg * avg) >> 7)); /* >> 7 divides by 128 = 8 * 16 */
+
+}
+
+unsigned int vp8_sub_pixel_variance4x4_wmt /* 4x4 sub-pixel variance via the MMX 4x4 bilinear helper; returns xxsum - xsum^2/16. */
+(
+ const unsigned char *src_ptr, /* passed in the helper's ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* row index into vp8_bilinear_filters_x86_4 for HFilter */
+ int yoffset, /* row index into vp8_bilinear_filters_x86_4 for VFilter */
+ const unsigned char *dst_ptr, /* passed in the helper's src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives xxsum */
+)
+{
+ int xsum; /* helper's sum output */
+ unsigned int xxsum; /* helper's sumsquared output */
+ vp8_filter_block2d_bil4x4_var_mmx(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line,
+ vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], /* table is not declared in this file; presumably from filter_x86.h */
+ &xsum, &xxsum
+ );
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 4)); /* >> 4 divides by 16 = 4 * 4 */
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt /* 8x8 sub-pixel variance; picks a dedicated routine for offsets (4,0), (0,4), (4,4), otherwise the general bilinear one. */
+(
+ const unsigned char *src_ptr, /* passed in the routines' ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* compared against 0 and 4; forwarded only on the general path */
+ int yoffset, /* compared against 0 and 4; forwarded only on the general path */
+ const unsigned char *dst_ptr, /* passed in the routines' src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives xxsum */
+)
+{
+ int xsum; /* sum output of whichever routine runs */
+ unsigned int xxsum; /* sumsquared output of whichever routine runs */
+
+ if (xoffset == 4 && yoffset == 0) /* half_horiz routine, Height 8 */
+ {
+ vp8_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4) /* half_vert routine, Height 8 */
+ {
+ vp8_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4) /* half_horiz_vert routine, Height 8 */
+ {
+ vp8_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum, &xxsum);
+ }
+ else /* every other offset pair, including (0, 0) */
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 6)); /* >> 6 divides by 64 = 8 * 8 */
+}
+
+unsigned int vp8_sub_pixel_variance16x16_wmt /* 16x16 sub-pixel variance; stores the sum of squares in *sse and returns it minus sum^2/256. */
+(
+ const unsigned char *src_ptr, /* passed in the routines' ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* compared against 0 and 4; forwarded only on the general path */
+ int yoffset, /* compared against 0 and 4; forwarded only on the general path */
+ const unsigned char *dst_ptr, /* passed in the routines' src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives the total sum of squares */
+)
+{
+ int xsum0, xsum1; /* xsum1 is written and read only on the general path */
+ unsigned int xxsum0, xxsum1; /* xxsum1 is written and read only on the general path */
+
+
+ /* note we could avoid these if statements if the calling function
+ * just called the appropriate functions inside.
+ */
+ if (xoffset == 4 && yoffset == 0) /* 16-wide half_horiz routine, Height 16 */
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4) /* 16-wide half_vert routine, Height 16 */
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4) /* 16-wide half_horiz_vert routine, Height 16 */
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else /* general path: two calls 8 columns apart, results added together */
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0
+ );
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum1, &xxsum1
+ );
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+ /* xsum0 is squared as unsigned: over 256 8-bit differences |xsum0| can reach 65280, whose square overflows int but fits in 32 unsigned bits. */
+ *sse = xxsum0;
+ return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8)); /* >> 8 divides by 256 = 16 * 16 */
+}
+
+unsigned int vp8_sub_pixel_mse16x16_wmt( /* Runs the 16x16 sub-pixel variance wrapper and returns the value it stored in *sse. */
+ const unsigned char *src_ptr, /* forwarded unchanged */
+ int src_pixels_per_line, /* forwarded unchanged */
+ int xoffset, /* forwarded unchanged */
+ int yoffset, /* forwarded unchanged */
+ const unsigned char *dst_ptr, /* forwarded unchanged */
+ int dst_pixels_per_line, /* forwarded unchanged */
+ unsigned int *sse /* out: written by the callee, then read back for the return value */
+)
+{
+ vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); /* the callee's own return value is discarded */
+ return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_wmt /* 16x8 sub-pixel variance; same dispatch as the 16x16 wrapper but with Height 8. */
+(
+ const unsigned char *src_ptr, /* passed in the routines' ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* compared against 0 and 4; forwarded only on the general path */
+ int yoffset, /* compared against 0 and 4; forwarded only on the general path */
+ const unsigned char *dst_ptr, /* passed in the routines' src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives the total sum of squares */
+
+)
+{
+ int xsum0, xsum1; /* xsum1 is written and read only on the general path */
+ unsigned int xxsum0, xxsum1; /* xxsum1 is written and read only on the general path */
+
+ if (xoffset == 4 && yoffset == 0) /* 16-wide half_horiz routine */
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4) /* 16-wide half_vert routine */
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4) /* 16-wide half_horiz_vert routine */
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else /* general path: two calls 8 columns apart, results added together */
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr + 8, src_pixels_per_line,
+ dst_ptr + 8, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum1, &xxsum1);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7)); /* >> 7 divides by 128 = 16 * 8 */
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt /* 8x16 sub-pixel variance; same dispatch as the 8x8 wrapper but with Height 16. */
+(
+ const unsigned char *src_ptr, /* passed in the routines' ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* compared against 0 and 4; forwarded only on the general path */
+ int yoffset, /* compared against 0 and 4; forwarded only on the general path */
+ const unsigned char *dst_ptr, /* passed in the routines' src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives xxsum */
+)
+{
+ int xsum; /* sum output of whichever routine runs */
+ unsigned int xxsum; /* sumsquared output of whichever routine runs */
+
+ if (xoffset == 4 && yoffset == 0) /* 8-wide half_horiz routine */
+ {
+ vp8_half_horiz_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 0 && yoffset == 4) /* 8-wide half_vert routine */
+ {
+ vp8_half_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else if (xoffset == 4 && yoffset == 4) /* 8-wide half_horiz_vert routine */
+ {
+ vp8_half_horiz_vert_variance8x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum, &xxsum);
+ }
+ else /* every other offset pair: a single general bilinear call */
+ {
+ vp8_filter_block2d_bil_var_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum, &xxsum);
+ }
+
+ *sse = xxsum;
+ return (xxsum - ((unsigned int)(xsum * xsum) >> 7)); /* >> 7 divides by 128 = 8 * 16 */
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_h_wmt( /* 16x16 variance via the half_horiz routine; returns xxsum0 - xsum0^2/256. */
+ const unsigned char *src_ptr, /* passed in the routine's ref_ptr slot */
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr, /* passed in the routine's src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse) /* out: receives xxsum0 */
+{
+ int xsum0; /* routine's sum output */
+ unsigned int xxsum0; /* routine's sumsquared output */
+
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ /* xsum0 is squared as unsigned: over 256 8-bit differences |xsum0| can reach 65280, whose square overflows int but fits in 32 unsigned bits. */
+ *sse = xxsum0;
+ return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8)); /* >> 8 divides by 256 = 16 * 16 */
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_v_wmt( /* 16x16 variance via the half_vert routine; returns xxsum0 - xsum0^2/256. */
+ const unsigned char *src_ptr, /* passed in the routine's ref_ptr slot */
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr, /* passed in the routine's src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse) /* out: receives xxsum0 */
+{
+ int xsum0; /* routine's sum output */
+ unsigned int xxsum0; /* routine's sumsquared output */
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ /* xsum0 is squared as unsigned: over 256 8-bit differences |xsum0| can reach 65280, whose square overflows int but fits in 32 unsigned bits. */
+ *sse = xxsum0;
+ return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8)); /* >> 8 divides by 256 = 16 * 16 */
+}
+
+
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt( /* 16x16 variance via the half_horiz_vert routine; returns xxsum0 - xsum0^2/256. */
+ const unsigned char *src_ptr, /* passed in the routine's ref_ptr slot */
+ int src_pixels_per_line,
+ const unsigned char *dst_ptr, /* passed in the routine's src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse) /* out: receives xxsum0 */
+{
+ int xsum0; /* routine's sum output */
+ unsigned int xxsum0; /* routine's sumsquared output */
+
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ /* xsum0 is squared as unsigned: over 256 8-bit differences |xsum0| can reach 65280, whose square overflows int but fits in 32 unsigned bits. */
+ *sse = xxsum0;
+ return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8)); /* >> 8 divides by 256 = 16 * 16 */
+}
diff --git a/libvpx/vp8/common/x86/variance_ssse3.c b/libvpx/vp8/common/x86/variance_ssse3.c
new file mode 100644
index 0000000..ba2055c
--- /dev/null
+++ b/libvpx/vp8/common/x86/variance_ssse3.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/common/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2 /* prototype only; nothing in this file calls it */
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2 /* used by both wrappers below when xoffset == 4 && yoffset == 4 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 16 or 8 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+extern void vp8_half_horiz_variance16x_h_sse2 /* used by both wrappers below when xoffset == 4 && yoffset == 0 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 16 or 8 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+extern void vp8_half_vert_variance16x_h_sse2 /* used by both wrappers below when xoffset == 0 && yoffset == 4 */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 16 or 8 */
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+extern void vp8_filter_block2d_bil_var_ssse3 /* general path of both wrappers below; called once per block */
+(
+ const unsigned char *ref_ptr, /* callers in this file pass their src_ptr here */
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr, /* callers in this file pass their dst_ptr here */
+ int src_pixels_per_line,
+ unsigned int Height, /* callers pass 16 or 8 */
+ int xoffset,
+ int yoffset,
+ int *sum, /* out */
+ unsigned int *sumsquared /* out */
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3 /* 16x16 sub-pixel variance; stores the sum of squares in *sse and returns it minus sum^2/256. */
+(
+ const unsigned char *src_ptr, /* passed in the routines' ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* compared against 0 and 4; forwarded only on the general path */
+ int yoffset, /* compared against 0 and 4; forwarded only on the general path */
+ const unsigned char *dst_ptr, /* passed in the routines' src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives xxsum0 */
+)
+{
+ int xsum0; /* sum output of whichever routine runs */
+ unsigned int xxsum0; /* sumsquared output of whichever routine runs */
+
+ /* note we could avoid these if statements if the calling function
+ * just called the appropriate functions inside.
+ */
+ if (xoffset == 4 && yoffset == 0) /* SSE2 half_horiz routine, Height 16 */
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4) /* SSE2 half_vert routine, Height 16 */
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4) /* SSE2 half_horiz_vert routine, Height 16 */
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ &xsum0, &xxsum0);
+ }
+ else /* every other offset pair: a single SSSE3 bilinear call */
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+ /* xsum0 is squared as unsigned: over 256 8-bit differences |xsum0| can reach 65280, whose square overflows int but fits in 32 unsigned bits. */
+ *sse = xxsum0;
+ return (xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8)); /* >> 8 divides by 256 = 16 * 16 */
+}
+
+unsigned int vp8_sub_pixel_variance16x8_ssse3 /* 16x8 sub-pixel variance; same dispatch as the 16x16 SSSE3 wrapper but with Height 8. */
+(
+ const unsigned char *src_ptr, /* passed in the routines' ref_ptr slot */
+ int src_pixels_per_line,
+ int xoffset, /* compared against 0 and 4; forwarded only on the general path */
+ int yoffset, /* compared against 0 and 4; forwarded only on the general path */
+ const unsigned char *dst_ptr, /* passed in the routines' src_ptr slot */
+ int dst_pixels_per_line,
+ unsigned int *sse /* out: receives xxsum0 */
+
+)
+{
+ int xsum0; /* sum output of whichever routine runs */
+ unsigned int xxsum0; /* sumsquared output of whichever routine runs */
+
+ if (xoffset == 4 && yoffset == 0) /* SSE2 half_horiz routine */
+ {
+ vp8_half_horiz_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 0 && yoffset == 4) /* SSE2 half_vert routine */
+ {
+ vp8_half_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else if (xoffset == 4 && yoffset == 4) /* SSE2 half_horiz_vert routine */
+ {
+ vp8_half_horiz_vert_variance16x_h_sse2(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ &xsum0, &xxsum0);
+ }
+ else /* every other offset pair: a single SSSE3 bilinear call */
+ {
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 8,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
+ }
+
+ *sse = xxsum0;
+ return (xxsum0 - ((unsigned int)(xsum0 * xsum0) >> 7)); /* >> 7 divides by 128 = 16 * 8 */
+}
diff --git a/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libvpx/vp8/common/x86/vp8_asm_stubs.c
new file mode 100644
index 0000000..3437a23
--- /dev/null
+++ b/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "filter_x86.h"
+
+extern const short vp8_six_tap_mmx[8][6*8]; /* defined outside this file; the MMX/SSE2 wrappers index it with xoffset / yoffset to pick a filter row */
+
+extern void vp8_filter_block1d_h6_mmx /* defined outside this file; first pass of the MMX six-tap wrappers */
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr, /* wrappers pass their FData2 scratch buffer (plus a column offset) */
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step, /* wrappers always pass 1 */
+ unsigned int output_height,
+ unsigned int output_width, /* units are defined by the external routine and are not visible here */
+ const short *vp8_filter /* wrappers pass vp8_six_tap_mmx[xoffset] */
+);
+extern void vp8_filter_block1dc_v6_mmx /* defined outside this file; second pass of the MMX six-tap wrappers */
+(
+ unsigned short *src_ptr, /* wrappers pass an offset into FData2 */
+ unsigned char *output_ptr, /* wrappers pass dst_ptr (plus a column offset) */
+ int output_pitch, /* wrappers pass dst_pitch */
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter /* wrappers pass vp8_six_tap_mmx[yoffset] */
+);
+extern void vp8_filter_block1d8_h6_sse2 /* first pass of the 8x8 / 8x4 SSE2 wrappers when both offsets are non-zero */
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr, /* wrappers pass FData2 */
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step, /* wrappers pass 1 */
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_sse2 /* first pass of the 16x16 SSE2 wrapper when both offsets are non-zero */
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr, /* wrapper passes FData2 */
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step, /* wrapper passes 1 */
+ unsigned int output_height,
+ unsigned int output_width,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_sse2 /* second pass of the 8x8 / 8x4 SSE2 wrappers */
+(
+ unsigned short *src_ptr, /* wrappers pass FData2 + 16 */
+ unsigned char *output_ptr,
+ int dst_ptich, /* spelled this way in the source; wrappers pass dst_pitch */
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width, /* wrappers pass dst_pitch in this slot as well */
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_v6_sse2 /* second pass of the 16x16 SSE2 wrapper */
+(
+ unsigned short *src_ptr, /* wrapper passes FData2 + 32 */
+ unsigned char *output_ptr,
+ int dst_ptich, /* spelled this way in the source; wrapper passes dst_pitch */
+ unsigned int pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width, /* wrapper passes dst_pitch in this slot as well */
+ const short *vp8_filter
+);
+extern void vp8_unpack_block1d16_h6_sse2 /* used by the 16x16 SSE2 wrapper in place of the horizontal pass when xoffset == 0 */
+(
+ unsigned char *src_ptr,
+ unsigned short *output_ptr, /* wrapper passes FData2 */
+ unsigned int src_pixels_per_line,
+ unsigned int output_height,
+ unsigned int output_width
+);
+extern void vp8_filter_block1d8_h6_only_sse2 /* single-pass routine used by the 8-wide SSE2 wrappers when yoffset == 0 */
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr, /* wrappers pass dst_ptr directly */
+ int dst_ptich, /* spelled this way in the source; wrappers pass dst_pitch */
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d16_h6_only_sse2 /* single-pass routine used by the 16x16 SSE2 wrapper when yoffset == 0 */
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr, /* wrapper passes dst_ptr directly */
+ int dst_ptich, /* spelled this way in the source; wrapper passes dst_pitch */
+ unsigned int output_height,
+ const short *vp8_filter
+);
+extern void vp8_filter_block1d8_v6_only_sse2 /* single-pass routine used by the 8-wide SSE2 wrappers when xoffset == 0 */
+(
+ unsigned char *src_ptr, /* wrappers pass src_ptr moved up by two source rows */
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr, /* wrappers pass dst_ptr directly */
+ int dst_ptich, /* spelled this way in the source; wrappers pass dst_pitch */
+ unsigned int output_height,
+ const short *vp8_filter
+);
+
+
+#if HAVE_MMX
+void vp8_sixtap_predict4x4_mmx /* 4x4 six-tap prediction in two passes through the scratch buffer FData2. */
+(
+ unsigned char *src_ptr, /* the first pass starts reading two source rows above this pointer */
+ int src_pixels_per_line,
+ int xoffset, /* row index into vp8_six_tap_mmx for the first pass */
+ int yoffset, /* row index into vp8_six_tap_mmx for the second pass */
+ unsigned char *dst_ptr, /* written by the second pass */
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); /* pixel_step 1, output_height 9, output_width 8 */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); /* pixels_per_line 8, pixel_step 4, output_height 4, output_width 4 */
+
+}
+
+
+void vp8_sixtap_predict16x16_mmx /* 16x16 six-tap prediction: each pass is issued four times, at column offsets 0, 4, 8 and 12. */
+(
+ unsigned char *src_ptr, /* the first pass starts reading two source rows above this pointer */
+ int src_pixels_per_line,
+ int xoffset, /* row index into vp8_six_tap_mmx for the first pass */
+ int yoffset, /* row index into vp8_six_tap_mmx for the second pass */
+ unsigned char *dst_ptr, /* written by the second pass */
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); /* pixel_step 1, output_height 21, output_width 32 */
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); /* same, source and scratch moved 4 columns right */
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); /* same, 8 columns right */
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); /* same, 12 columns right */
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); /* pixels_per_line 32, pixel_step 16, output_height 16, output_width 16 */
+ vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); /* same, scratch and destination moved 4 columns right */
+ vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); /* same, 8 columns right */
+ vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); /* same, 12 columns right */
+
+}
+
+
+void vp8_sixtap_predict8x8_mmx /* 8x8 six-tap prediction: each pass is issued twice, at column offsets 0 and 4. */
+(
+ unsigned char *src_ptr, /* the first pass starts reading two source rows above this pointer */
+ int src_pixels_per_line,
+ int xoffset, /* row index into vp8_six_tap_mmx for the first pass */
+ int yoffset, /* row index into vp8_six_tap_mmx for the second pass */
+ unsigned char *dst_ptr, /* written by the second pass */
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); /* pixel_step 1, output_height 13, output_width 16 */
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); /* same, source and scratch moved 4 columns right */
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); /* pixels_per_line 16, pixel_step 8, output_height 8, output_width 8 */
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); /* same, scratch and destination moved 4 columns right */
+
+}
+
+
+void vp8_sixtap_predict8x4_mmx /* 8x4 six-tap prediction: each pass is issued twice, at column offsets 0 and 4. */
+(
+ unsigned char *src_ptr, /* the first pass starts reading two source rows above this pointer */
+ int src_pixels_per_line,
+ int xoffset, /* row index into vp8_six_tap_mmx for the first pass */
+ int yoffset, /* row index into vp8_six_tap_mmx for the second pass */
+ unsigned char *dst_ptr, /* written by the second pass */
+ int dst_pitch
+)
+{
+
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); /* pixel_step 1, output_height 9, output_width 16 */
+ vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); /* same, source and scratch moved 4 columns right */
+
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); /* pixels_per_line 16, pixel_step 8, output_height 4, output_width 8 */
+ vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); /* same, scratch and destination moved 4 columns right */
+
+}
+
+
+
+void vp8_bilinear_predict16x16_mmx /* 16x16 bilinear prediction assembled from four 8x8 calls, one per quadrant. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* forwarded unchanged to every 8x8 call */
+ int yoffset, /* forwarded unchanged to every 8x8 call */
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); /* top-left quadrant; callee is not declared in this file (presumably via vpx_rtcd.h) */
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); /* top-right: 8 columns across */
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); /* bottom-left: 8 rows down in each buffer */
+ vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); /* bottom-right: 8 rows down and 8 columns across */
+}
+#endif
+
+
+#if HAVE_SSE2
+void vp8_sixtap_predict16x16_sse2 /* 16x16 six-tap prediction; chooses two-pass, horizontal-only or vertical-only work from the offsets. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* non-zero selects horizontal filtering with vp8_six_tap_mmx[xoffset] */
+ int yoffset, /* row index into vp8_six_tap_mmx for the vertical pass */
+ unsigned char *dst_ptr,
+ int dst_pitch
+
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data buffer used in filtering */
+
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset) /* both offsets non-zero: horizontal pass into FData2, then vertical pass into dst_ptr */
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); /* pixel_step 1, output_height 21, output_width 32 */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); /* NOTE(review): dst_pitch is also passed in the output_width slot -- confirm the routine's use of it */
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); /* writes dst_ptr directly, output_height 16 */
+ }
+ }
+ else /* xoffset == 0; this branch is also taken when yoffset == 0 */
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); /* fills FData2 without a filter argument; output_height 21, output_width 32 */
+ vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); /* NOTE(review): dst_pitch is also passed in the output_width slot -- confirm the routine's use of it */
+ }
+}
+
+
+void vp8_sixtap_predict8x8_sse2 /* 8x8 six-tap prediction; chooses two-pass, horizontal-only or vertical-only work from the offsets. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* non-zero selects horizontal filtering with vp8_six_tap_mmx[xoffset] */
+ int yoffset, /* row index into vp8_six_tap_mmx for the vertical pass */
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset) /* both offsets non-zero: horizontal pass into FData2, then vertical pass into dst_ptr */
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); /* pixel_step 1, output_height 13, output_width 16 */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); /* NOTE(review): dst_pitch is also passed in the output_width slot -- confirm the routine's use of it */
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); /* writes dst_ptr directly, output_height 8 */
+ }
+ }
+ else /* xoffset == 0; this branch is also taken when yoffset == 0 */
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); /* reads from two rows above src_ptr and writes dst_ptr directly; FData2 is not used */
+ }
+}
+
+
+void vp8_sixtap_predict8x4_sse2 /* 8x4 six-tap prediction; chooses two-pass, horizontal-only or vertical-only work from the offsets. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* non-zero selects horizontal filtering with vp8_six_tap_mmx[xoffset] */
+ int yoffset, /* row index into vp8_six_tap_mmx for the vertical pass */
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data buffer used in filtering */
+ const short *HFilter, *VFilter;
+
+ if (xoffset)
+ {
+ if (yoffset) /* both offsets non-zero: horizontal pass into FData2, then vertical pass into dst_ptr */
+ {
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); /* pixel_step 1, output_height 9, output_width 16 */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); /* NOTE(review): dst_pitch is also passed in the output_width slot -- confirm the routine's use of it */
+ }
+ else
+ {
+ /* First-pass only */
+ HFilter = vp8_six_tap_mmx[xoffset];
+ vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); /* writes dst_ptr directly, output_height 4 */
+ }
+ }
+ else /* xoffset == 0; this branch is also taken when yoffset == 0 */
+ {
+ /* Second-pass only */
+ VFilter = vp8_six_tap_mmx[yoffset];
+ vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); /* reads from two rows above src_ptr and writes dst_ptr directly; FData2 is not used */
+ }
+}
+
+#endif
+
+#if HAVE_SSSE3
+
+extern void vp8_filter_block1d8_h6_ssse3 /* horizontal routine used by the 8x8 and 8x4 SSSE3 wrappers */
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr, /* wrappers pass either FData2 or dst_ptr */
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index /* wrappers pass xoffset */
+);
+
+extern void vp8_filter_block1d16_h6_ssse3 /* horizontal routine used by the 16x16 SSSE3 wrapper */
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr, /* wrapper passes either FData2 or dst_ptr */
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index /* wrapper passes xoffset */
+);
+
+extern void vp8_filter_block1d16_v6_ssse3 /* vertical routine used by the 16x16 SSSE3 wrapper */
+(
+ unsigned char *src_ptr, /* wrapper passes either FData2 or src_ptr moved up two rows */
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index /* wrapper passes yoffset */
+);
+
+extern void vp8_filter_block1d8_v6_ssse3 /* vertical routine used by the 8x8 and 8x4 SSSE3 wrappers */
+(
+ unsigned char *src_ptr, /* wrappers pass either FData2 or src_ptr moved up two rows */
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index /* wrappers pass yoffset */
+);
+
+extern void vp8_filter_block1d4_h6_ssse3 /* horizontal routine used by the 4x4 SSSE3 wrapper */
+(
+ unsigned char *src_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned char *output_ptr, /* wrapper passes either FData2 or dst_ptr */
+ unsigned int output_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index /* wrapper passes xoffset */
+);
+
+extern void vp8_filter_block1d4_v6_ssse3 /* vertical routine used by the 4x4 SSSE3 wrapper */
+(
+ unsigned char *src_ptr, /* wrapper passes either FData2 or src_ptr moved up two rows */
+ unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ unsigned int vp8_filter_index /* wrapper passes yoffset */
+);
+
+void vp8_sixtap_predict16x16_ssse3 /* 16x16 six-tap prediction; two-pass, horizontal-only, vertical-only or plain copy depending on the offsets. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* passed to the horizontal routine as its filter index */
+ int yoffset, /* passed to the vertical routine as its filter index */
+ unsigned char *dst_ptr,
+ int dst_pitch
+
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); /* byte scratch buffer; only the two-pass path uses it */
+
+ if (xoffset)
+ {
+ if (yoffset) /* two passes: horizontal into FData2 (pitch 16, 21 rows), then vertical into dst_ptr (16 rows) */
+ {
+ vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 16, 21, xoffset);
+ vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
+ 16, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 16, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); /* not declared in this file; presumably via vpx_rtcd.h */
+ }
+ }
+}
+
+void vp8_sixtap_predict8x8_ssse3 /* 8x8 six-tap prediction; two-pass, horizontal-only, vertical-only or plain copy depending on the offsets. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* passed to the horizontal routine as its filter index */
+ int yoffset, /* passed to the vertical routine as its filter index */
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); /* byte scratch buffer; only the two-pass path uses it */
+
+ if (xoffset)
+ {
+ if (yoffset) /* two passes: horizontal into FData2 (pitch 8, 13 rows), then vertical into dst_ptr (8 rows) */
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 13, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 8, yoffset);
+ }
+ else /* horizontal filtering only, written straight to dst_ptr */
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 8, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); /* not declared in this file; presumably via vpx_rtcd.h */
+ }
+ }
+}
+
+
+void vp8_sixtap_predict8x4_ssse3 /* 8x4 six-tap prediction; two-pass, horizontal-only, vertical-only or plain copy depending on the offsets. */
+(
+ unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset, /* passed to the horizontal routine as its filter index */
+ int yoffset, /* passed to the vertical routine as its filter index */
+ unsigned char *dst_ptr,
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); /* byte scratch buffer; only the two-pass path uses it */
+
+ if (xoffset)
+ {
+ if (yoffset) /* two passes: horizontal into FData2 (pitch 8, 9 rows), then vertical into dst_ptr (4 rows) */
+ {
+ vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line, FData2,
+ 8, 9, xoffset);
+ vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else
+ {
+ /* First-pass only */
+ vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset)
+ {
+ /* Second-pass only */
+ vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); /* not declared in this file; presumably via vpx_rtcd.h */
+ }
+ }
+}
+
+void vp8_sixtap_predict4x4_ssse3 /* 4x4 six-tap prediction; two-pass, horizontal-only, vertical-only or an inline 4-row copy depending on the offsets. */
+(
+ unsigned char *src_ptr, /* advanced row by row on the copy path (local copy of the pointer) */
+ int src_pixels_per_line,
+ int xoffset, /* passed to the horizontal routine as its filter index */
+ int yoffset, /* passed to the vertical routine as its filter index */
+ unsigned char *dst_ptr, /* advanced row by row on the copy path (local copy of the pointer) */
+ int dst_pitch
+)
+{
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9); /* 36 bytes: pitch 4 times the 9 rows requested from the horizontal pass */
+
+ if (xoffset)
+ {
+ if (yoffset) /* two passes: horizontal into FData2, then vertical into dst_ptr (4 rows) */
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ FData2, 4, 9, xoffset);
+ vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+ 4, yoffset);
+ }
+ else /* horizontal filtering only, written straight to dst_ptr */
+ {
+ vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, xoffset);
+ }
+ }
+ else
+ {
+ if (yoffset) /* vertical filtering only, reading from two rows above src_ptr */
+ {
+ vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+ src_pixels_per_line,
+ dst_ptr, dst_pitch, 4, yoffset);
+ }
+ else
+ {
+ /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+ * yoffset==0) case correctly. Add copy function here to guarantee
+ * six-tap function handles all possible offsets. */
+ int r; /* row counter for the copy loop */
+
+ for (r = 0; r < 4; r++) /* copies 4 bytes per row for 4 rows */
+ {
+ #if !(CONFIG_FAST_UNALIGNED)
+ dst_ptr[0] = src_ptr[0];
+ dst_ptr[1] = src_ptr[1];
+ dst_ptr[2] = src_ptr[2];
+ dst_ptr[3] = src_ptr[3];
+ #else
+ *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr ; /* NOTE(review): one 32-bit load/store through cast byte pointers -- relies on the target tolerating unaligned, type-punned access */
+ #endif
+ dst_ptr += dst_pitch;
+ src_ptr += src_pixels_per_line;
+ }
+ }
+ }
+}
+
+#endif