summaryrefslogtreecommitdiffstats
path: root/libvpx/vp9/common/x86/vp9_asm_stubs.c
diff options
context:
space:
mode:
Diffstat (limited to 'libvpx/vp9/common/x86/vp9_asm_stubs.c')
-rw-r--r--libvpx/vp9/common/x86/vp9_asm_stubs.c693
1 files changed, 282 insertions, 411 deletions
diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c
index 106e6d4..1b4904c 100644
--- a/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -13,45 +13,205 @@
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
- { 128, 128, 128, 128, 0, 0, 0, 0 },
- { 120, 120, 120, 120, 8, 8, 8, 8 },
- { 112, 112, 112, 112, 16, 16, 16, 16 },
- { 104, 104, 104, 104, 24, 24, 24, 24 },
- { 96, 96, 96, 96, 32, 32, 32, 32 },
- { 88, 88, 88, 88, 40, 40, 40, 40 },
- { 80, 80, 80, 80, 48, 48, 48, 48 },
- { 72, 72, 72, 72, 56, 56, 56, 56 },
- { 64, 64, 64, 64, 64, 64, 64, 64 },
- { 56, 56, 56, 56, 72, 72, 72, 72 },
- { 48, 48, 48, 48, 80, 80, 80, 80 },
- { 40, 40, 40, 40, 88, 88, 88, 88 },
- { 32, 32, 32, 32, 96, 96, 96, 96 },
- { 24, 24, 24, 24, 104, 104, 104, 104 },
- { 16, 16, 16, 16, 112, 112, 112, 112 },
- { 8, 8, 8, 8, 120, 120, 120, 120 }
-};
typedef void filter8_1dfunction (
const unsigned char *src_ptr,
- const unsigned int src_pitch,
+ const ptrdiff_t src_pitch,
unsigned char *output_ptr,
- unsigned int out_pitch,
+ ptrdiff_t out_pitch,
unsigned int output_height,
const short *filter
);
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h); \
+ } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
+ vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 7); \
+ vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
+ vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 1); \
+ vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } \
+ } else { \
+ vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+}
+#if HAVE_AVX2
+filter8_1dfunction vp9_filter_block1d16_v8_avx2;
+filter8_1dfunction vp9_filter_block1d16_h8_avx2;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2);
+#endif
#if HAVE_SSSE3
+#if (ARCH_X86_64)
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
@@ -59,201 +219,57 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Ensure the filter can be compressed to int16_t. */
- if (x_step_q4 == 16 && filter_x[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_h8_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_h8_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_h8_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (y_step_q4 == 16 && filter_y[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (x_step_q4 == 16 && filter_x[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (y_step_q4 == 16 && filter_y[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
- assert(w <= 64);
- assert(h <= 64);
- if (x_step_q4 == 16 && y_step_q4 == 16) {
- vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h + 7);
- vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- }
-}
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ ssse3);
- assert(w <= 64);
- assert(h <= 64);
- if (x_step_q4 == 16 && y_step_q4 == 16) {
- vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h + 7);
- vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- } else {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- }
-}
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);
#endif
#if HAVE_SSE2
@@ -270,199 +286,54 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- /* Ensure the filter can be compressed to int16_t. */
- if (x_step_q4 == 16 && filter_x[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_h8_sse2(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_h8_sse2(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_h8_sse2(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
+filter8_1dfunction vp9_filter_block1d16_v2_sse2;
+filter8_1dfunction vp9_filter_block1d16_h2_sse2;
+filter8_1dfunction vp9_filter_block1d8_v2_sse2;
+filter8_1dfunction vp9_filter_block1d8_h2_sse2;
+filter8_1dfunction vp9_filter_block1d4_v2_sse2;
+filter8_1dfunction vp9_filter_block1d4_h2_sse2;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (y_step_q4 == 16 && filter_y[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
+// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (x_step_q4 == 16 && filter_x[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_h8_avg_sse2(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_h8_avg_sse2(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_h8_avg_sse2(src, src_stride,
- dst, dst_stride,
- h, filter_x);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- if (y_step_q4 == 16 && filter_y[3] != 128) {
- while (w >= 16) {
- vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 16;
- dst += 16;
- w -= 16;
- }
- while (w >= 8) {
- vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 8;
- dst += 8;
- w -= 8;
- }
- while (w >= 4) {
- vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
- dst, dst_stride,
- h, filter_y);
- src += 4;
- dst += 4;
- w -= 4;
- }
- }
- if (w) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- }
-}
-
-void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
- assert(w <= 64);
- assert(h <= 64);
- if (x_step_q4 == 16 && y_step_q4 == 16) {
- vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h + 7);
- vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- }
-}
-
-void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
-
- assert(w <= 64);
- assert(h <= 64);
- if (x_step_q4 == 16 && y_step_q4 == 16) {
- vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h + 7);
- vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4,
- w, h);
- } else {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- }
-}
+// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, sse2);
+FUN_CONV_2D(avg_ , sse2);
#endif