| field | value | date |
|---|---|---|
| author | hkuang <hkuang@google.com> | 2013-09-16 15:09:58 -0700 |
| committer | Hangyu Kuang <hkuang@google.com> | 2013-09-17 22:05:28 +0000 |
| commit | 1184aebb761cbeac9124c37189a80a1a58f04b6b (patch) | |
| tree | b1ce6b3d29c43ffd22eb18999c5c3bad26513a48 | |
| parent | f3bed9137f66ef693bd406e43b17e9a1114f1e14 (diff) | |
Roll latest libvpx into Android.
The latest libvpx has more NEON optimizations and many algorithmic optimizations that make VP9 decoding much faster.
bug:10804666
Change-Id: I75eaacea57ecc7542a780be778f0e9e157978524
(cherry picked from commit 3df0563f1b24dac6c0bd122fc922a48211269061)
189 files changed, 16429 insertions, 9643 deletions
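Most of the churn below is in the per-target RTCD headers, where a generic function name is bound to its best implementation with a plain `#define` (this build passes `--disable-runtime-cpu-detect`, so the choice is fixed at configure time). The sketch below illustrates that dispatch pattern for `vp9_convolve_copy`; the prototypes are copied from the diff, but the `#if HAVE_NEON` guard is an illustrative assumption — the generated header for each target simply hard-codes one of the two defines.

```c
/* Illustration of the compile-time RTCD dispatch used by the headers in
 * this diff. The generated vp9_rtcd.h hard-codes one #define per target;
 * the HAVE_NEON guard here just shows both outcomes side by side. */
#include <stdint.h>
#include <stddef.h>

void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h);
void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h);

#if HAVE_NEON
#define vp9_convolve_copy vp9_convolve_copy_neon /* armv7a-neon target */
#else
#define vp9_convolve_copy vp9_convolve_copy_c    /* generic/armv7a targets */
#endif
```

Calls through `vp9_convolve_copy(...)` then compile directly to the chosen routine with no indirection, which is why this commit only needs to edit the headers (plus the source lists) to light up the new NEON assembly.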
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt index 25ca5e0..8f8b655 100644 --- a/armv7a-neon/libvpx_srcs.txt +++ b/armv7a-neon/libvpx_srcs.txt @@ -203,13 +203,25 @@ vp8/vp8_cx_iface.c vp8/vp8cx.mk vp8/vp8_dx_iface.c vp8/vp8dx.mk +vp9/common/arm/neon/vp9_avg_neon.asm.s vp9/common/arm/neon/vp9_convolve8_avg_neon.asm.s vp9/common/arm/neon/vp9_convolve8_neon.asm.s vp9/common/arm/neon/vp9_convolve_neon.c +vp9/common/arm/neon/vp9_copy_neon.asm.s vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s +vp9/common/arm/neon/vp9_idct16x16_neon.c +vp9/common/arm/neon/vp9_idct32x32_neon.c vp9/common/arm/neon/vp9_loopfilter_neon.asm.s vp9/common/arm/neon/vp9_mb_lpf_neon.asm.s +vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm.s +vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm.s vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm.s +vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm.s +vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm.s vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h @@ -257,6 +269,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h index 4ebb497..fdca309 100644 --- a/armv7a-neon/vp9_rtcd.h +++ b/armv7a-neon/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 
vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -238,10 +238,12 @@ void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, i #define vp9_blend_b vp9_blend_b_c void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve_copy vp9_convolve_copy_c +void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_neon void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve_avg vp9_convolve_avg_c +void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, 
uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_neon void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -268,41 +270,51 @@ void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8 #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_neon void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct4x4_1_add vp9_short_idct4x4_1_add_c +void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct4x4_1_add vp9_short_idct4x4_1_add_neon void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct4x4_add vp9_short_idct4x4_add_neon void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c +void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_neon void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_neon void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c +void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_neon void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c +void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_neon void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct16x16_add vp9_short_idct16x16_add_c +void vp9_short_idct16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_add vp9_short_idct16x16_add_neon void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c +void vp9_short_idct10_16x16_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_neon void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct32x32_add vp9_short_idct32x32_add_c +void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct32x32_add vp9_short_idct32x32_add_neon void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); -#define vp9_short_iht4x4_add vp9_short_iht4x4_add_c +void vp9_short_iht4x4_add_neon(int16_t 
*input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_short_iht4x4_add vp9_short_iht4x4_add_neon void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); -#define vp9_short_iht8x8_add vp9_short_iht8x8_add_c +void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_short_iht8x8_add vp9_short_iht8x8_add_neon void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_short_iht16x16_add vp9_short_iht16x16_add_c @@ -316,12 +328,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/armv7a-neon/vpx_config.c b/armv7a-neon/vpx_config.c index 77be6fb..dc64ce3 100644 --- a/armv7a-neon/vpx_config.c +++ b/armv7a-neon/vpx_config.c @@ -5,5 +5,5 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ -static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only"; +static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/armv7a-neon/vpx_config.h b/armv7a-neon/vpx_config.h index d132e4d..452bc91 100644 --- a/armv7a-neon/vpx_config.h +++ b/armv7a-neon/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/armv7a-neon/vpx_scale_rtcd.h b/armv7a-neon/vpx_scale_rtcd.h index 9972777..8c2ab2f 100644 --- a/armv7a-neon/vpx_scale_rtcd.h +++ b/armv7a-neon/vpx_scale_rtcd.h @@ -34,13 +34,13 @@ void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vp8_yv12_extend_frame_borders_neon(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_neon -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_neon -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_y_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y 
vp8_yv12_copy_y_neon +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vpx_yv12_copy_y_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_neon void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/armv7a/libvpx_srcs.txt b/armv7a/libvpx_srcs.txt index 2ddb1bd..8f41f9f 100644 --- a/armv7a/libvpx_srcs.txt +++ b/armv7a/libvpx_srcs.txt @@ -212,6 +212,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h index 1ce24c5..36202d2 100644 --- a/armv7a/vp9_rtcd.h +++ b/armv7a/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void 
vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t 
*above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/armv7a/vpx_config.c b/armv7a/vpx_config.c index a246c39..ecdb0cf 100644 --- a/armv7a/vpx_config.c +++ b/armv7a/vpx_config.c @@ -5,5 +5,5 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ -static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --disable-neon --disable-examples --disable-docs --enable-realtime-only"; +static const char* const cfg = "--target=armv7-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --disable-neon --disable-examples --disable-docs --enable-realtime-only"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/armv7a/vpx_config.h b/armv7a/vpx_config.h index a330023..c789546 100644 --- a/armv7a/vpx_config.h +++ b/armv7a/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/armv7a/vpx_scale_rtcd.h b/armv7a/vpx_scale_rtcd.h index d4212f2..0df8b37 100644 --- a/armv7a/vpx_scale_rtcd.h +++ b/armv7a/vpx_scale_rtcd.h @@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y vp8_yv12_copy_y_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/generic/libvpx_srcs.txt b/generic/libvpx_srcs.txt index 055f5fb..8e6fad7 100644 --- a/generic/libvpx_srcs.txt +++ b/generic/libvpx_srcs.txt @@ -172,6 +172,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h index 2562e82..4dcc1f6 100644 --- a/generic/vp9_rtcd.h +++ b/generic/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, 
uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define 
vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void 
vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t 
*input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/generic/vpx_config.h b/generic/vpx_config.h index 4d6172b..c856d4d 100644 --- a/generic/vpx_config.h +++ b/generic/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/generic/vpx_scale_rtcd.h b/generic/vpx_scale_rtcd.h index c2842ee..472a290 100644 --- a/generic/vpx_scale_rtcd.h +++ b/generic/vpx_scale_rtcd.h @@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y vp8_yv12_copy_y_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/libvpx/build/make/armlink_adapter.sh b/libvpx/build/make/armlink_adapter.sh index b53669c..75c342e 100755 --- a/libvpx/build/make/armlink_adapter.sh +++ b/libvpx/build/make/armlink_adapter.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. ## @@ -13,20 +13,20 @@ verbose=0 set -- $* for i; do - if [ "$i" == "-o" ]; then + if [ "$i" = "-o" ]; then on_of=1 - elif [ "$i" == "-v" ]; then + elif [ "$i" = "-v" ]; then verbose=1 - elif [ "$i" == "-g" ]; then + elif [ "$i" = "-g" ]; then args="${args} --debug" - elif [ "$on_of" == "1" ]; then + elif [ "$on_of" = "1" ]; then outfile=$i on_of=0 elif [ -f "$i" ]; then infiles="$infiles $i" - elif [ "${i:0:2}" == "-l" ]; then + elif [ "${i#-l}" != "$i" ]; then libs="$libs ${i#-l}" - elif [ "${i:0:2}" == "-L" ]; then + elif [ "${i#-L}" != "$i" ]; then libpaths="${libpaths} ${i#-L}" else args="${args} ${i}" diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index e2566b0..bb7ab41 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## configure.sh ## @@ -198,11 +198,11 @@ add_extralibs() { # # Boolean Manipulation Functions # -enable(){ +enable_feature(){ set_all yes $* } -disable(){ +disable_feature(){ set_all no $* } @@ -219,7 +219,7 @@ soft_enable() { for var in $*; do if ! 
disabled $var; then log_echo " enabling $var" - enable $var + enable_feature $var fi done } @@ -228,7 +228,7 @@ soft_disable() { for var in $*; do if ! enabled $var; then log_echo " disabling $var" - disable $var + disable_feature $var fi done } @@ -251,10 +251,10 @@ tolower(){ # Temporary File Functions # source_path=${0%/*} -enable source_path_used +enable_feature source_path_used if test -z "$source_path" -o "$source_path" = "." ; then source_path="`pwd`" - disable source_path_used + disable_feature source_path_used fi if test ! -z "$TMPDIR" ; then @@ -264,12 +264,13 @@ elif test ! -z "$TEMPDIR" ; then else TMPDIRx="/tmp" fi -TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h" -TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c" -TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc" -TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o" -TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x" -TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm" +RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}') +TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h" +TMP_C="${TMPDIRx}/vpx-conf-$$-${RAND}.c" +TMP_CC="${TMPDIRx}/vpx-conf-$$-${RAND}.cc" +TMP_O="${TMPDIRx}/vpx-conf-$$-${RAND}.o" +TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x" +TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm" clean_temp_files() { rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM} @@ -316,8 +317,8 @@ check_header(){ header=$1 shift var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'` - disable $var - check_cpp "$@" <<EOF && enable $var + disable_feature $var + check_cpp "$@" <<EOF && enable_feature $var #include "$header" int x; EOF @@ -479,7 +480,7 @@ process_common_cmdline() { for opt in "$@"; do optval="${opt#*=}" case "$opt" in - --child) enable child + --child) enable_feature child ;; --log*) logging="$optval" @@ -491,7 +492,7 @@ process_common_cmdline() { ;; --target=*) toolchain="${toolchain:-${optval}}" ;; - --force-target=*) toolchain="${toolchain:-${optval}}"; enable force_toolchain + --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain ;; --cpu) ;; @@ -511,7 +512,7 @@ process_common_cmdline() { echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null || die_unknown $opt fi - $action $option + ${action}_feature $option ;; --require-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` @@ -523,11 +524,11 @@ process_common_cmdline() { ;; --force-enable-?*|--force-disable-?*) eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'` - $action $option + ${action}_feature $option ;; --libc=*) [ -d "${optval}" ] || die "Not a directory: ${optval}" - disable builtin_libc + disable_feature builtin_libc alt_libc="${optval}" ;; --as=*) @@ -696,13 +697,13 @@ process_common_toolchain() { # Mark the specific ISA requested as enabled soft_enable ${tgt_isa} - enable ${tgt_os} - enable ${tgt_cc} + enable_feature ${tgt_os} + enable_feature ${tgt_cc} # Enable the architecture family case ${tgt_isa} in - arm*) enable arm;; - mips*) enable mips;; + arm*) enable_feature arm;; + mips*) enable_feature mips;; esac # PIC is probably what we want when building shared libs @@ -765,7 +766,7 @@ process_common_toolchain() { case ${toolchain} in sparc-solaris-*) add_extralibs -lposix4 - disable fast_unaligned + disable_feature fast_unaligned ;; *-solaris-*) add_extralibs -lposix4 @@ -790,7 +791,7 @@ process_common_toolchain() { ;; armv5te) soft_enable edsp - disable fast_unaligned + disable_feature fast_unaligned ;; esac @@ -805,7 +806,7 @@ process_common_toolchain() { arch_int=${arch_int%%te} check_add_asflags --defsym ARCHITECTURE=${arch_int} 
tune_cflags="-mtune=" - if [ ${tgt_isa} == "armv7" ]; then + if [ ${tgt_isa} = "armv7" ]; then if [ -z "${float_abi}" ]; then check_cpp <<EOF && float_abi=hard || float_abi=softfp #ifndef __ARM_PCS_VFP @@ -842,8 +843,8 @@ EOF asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" AS_SFX=.s msvs_arch_dir=arm-msvs - disable multithread - disable unit_tests + disable_feature multithread + disable_feature unit_tests ;; rvct) CC=armcc @@ -855,7 +856,7 @@ EOF tune_cflags="--cpu=" tune_asflags="--cpu=" if [ -z "${tune_cpu}" ]; then - if [ ${tgt_isa} == "armv7" ]; then + if [ ${tgt_isa} = "armv7" ]; then if enabled neon then check_add_cflags --fpu=softvfp+vfpv3 @@ -880,8 +881,8 @@ EOF case ${tgt_os} in none*) - disable multithread - disable os_support + disable_feature multithread + disable_feature os_support ;; android*) @@ -913,9 +914,9 @@ EOF # Cortex-A8 implementations (NDK Dev Guide) add_ldflags "-Wl,--fix-cortex-a8" - enable pic + enable_feature pic soft_enable realtime_only - if [ ${tgt_isa} == "armv7" ]; then + if [ ${tgt_isa} = "armv7" ]; then soft_enable runtime_cpu_detect fi if enabled runtime_cpu_detect; then @@ -969,7 +970,7 @@ EOF ;; linux*) - enable linux + enable_feature linux if enabled rvct; then # Check if we have CodeSourcery GCC in PATH. Needed for # libraries @@ -1000,14 +1001,14 @@ EOF tune_cflags="-mtune=" if enabled dspr2; then check_add_cflags -mips32r2 -mdspr2 - disable fast_unaligned + disable_feature fast_unaligned fi check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; ppc*) - enable ppc + enable_feature ppc bits=${tgt_isa##ppc} link_with_cc=gcc setup_gnu_toolchain @@ -1155,7 +1156,7 @@ EOF ;; universal*|*-gcc|generic-gnu) link_with_cc=gcc - enable gcc + enable_feature gcc setup_gnu_toolchain ;; esac @@ -1191,7 +1192,7 @@ EOF # default use_x86inc to yes if pic is no or 64bit or we are not on darwin echo " checking here for x86inc \"${tgt_isa}\" \"$pic\" " - if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! ${tgt_os:0:6} = darwin ]; then + if [ ${tgt_isa} = x86_64 -o ! "$pic" = "yes" -o "${tgt_os#darwin}" = "${tgt_os}" ]; then soft_enable use_x86inc fi @@ -1204,14 +1205,14 @@ EOF enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 # Check for strip utility variant - ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip + ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip # Try to determine target endianness check_cc <<EOF unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E'; EOF [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' | - grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian + grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian # Try to find which inline keywords are supported check_cc <<EOF && INLINE="inline" @@ -1236,7 +1237,7 @@ EOF if enabled dspr2; then if enabled big_endian; then echo "dspr2 optimizations are available only for little endian platforms" - disable dspr2 + disable_feature dspr2 fi fi ;; @@ -1287,8 +1288,8 @@ print_config_h() { print_webm_license() { local destination=$1 - local prefix=$2 - local suffix=$3 + local prefix="$2" + local suffix="$3" shift 3 cat <<EOF > ${destination} ${prefix} Copyright (c) 2011 The WebM project authors. 
All Rights Reserved.${suffix} @@ -1309,7 +1310,7 @@ process_detect() { true; } -enable logging +enable_feature logging logfile="config.log" self=$0 process() { diff --git a/libvpx/build/make/gen_asm_deps.sh b/libvpx/build/make/gen_asm_deps.sh index 0b4e3aa..6a7bff9 100755 --- a/libvpx/build/make/gen_asm_deps.sh +++ b/libvpx/build/make/gen_asm_deps.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. ## diff --git a/libvpx/build/make/version.sh b/libvpx/build/make/version.sh index 3efb956..e31e568 100755 --- a/libvpx/build/make/version.sh +++ b/libvpx/build/make/version.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## Copyright (c) 2010 The WebM project authors. All Rights Reserved. ## diff --git a/libvpx/configure b/libvpx/configure index 24be893..297cec4 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh ## ## configure ## @@ -38,6 +38,7 @@ Advanced options: ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing + ${toggle_vp9_postproc} vp9 specific postprocessing ${toggle_multithread} multithreaded encoding and decoding ${toggle_spatial_resampling} spatial sampling (scaling) support ${toggle_realtime_only} enable this option while building for real-time encoding @@ -153,7 +154,7 @@ all_targets="libs examples docs" # all targets available are enabled, by default. for t in ${all_targets}; do - [ -f ${source_path}/${t}.mk ] && enable ${t} + [ -f ${source_path}/${t}.mk ] && enable_feature ${t} done # check installed doxygen version @@ -164,30 +165,30 @@ if [ ${doxy_major:-0} -ge 1 ]; then doxy_minor=${doxy_version%%.*} doxy_patch=${doxy_version##*.} - [ $doxy_major -gt 1 ] && enable doxygen - [ $doxy_minor -gt 5 ] && enable doxygen - [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable doxygen + [ $doxy_major -gt 1 ] && enable_feature doxygen + [ $doxy_minor -gt 5 ] && enable_feature doxygen + [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen fi # install everything except the sources, by default. sources will have # to be enabled when doing dist builds, since that's no longer a common # case. -enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs -enable install_bins -enable install_libs - -enable static -enable optimizations -enable fast_unaligned #allow unaligned accesses, if supported by hw -enable md5 -enable spatial_resampling -enable multithread -enable os_support -enable temporal_denoising - -[ -d ${source_path}/../include ] && enable alt_tree_layout +enabled doxygen && php -v >/dev/null 2>&1 && enable_feature install_docs +enable_feature install_bins +enable_feature install_libs + +enable_feature static +enable_feature optimizations +enable_feature fast_unaligned #allow unaligned accesses, if supported by hw +enable_feature md5 +enable_feature spatial_resampling +enable_feature multithread +enable_feature os_support +enable_feature temporal_denoising + +[ -d ${source_path}/../include ] && enable_feature alt_tree_layout for d in vp8 vp9; do - [ -d ${source_path}/${d} ] && disable alt_tree_layout; + [ -d ${source_path}/${d} ] && disable_feature alt_tree_layout; done if ! 
enabled alt_tree_layout; then @@ -200,10 +201,10 @@ else [ -f ${source_path}/../include/vpx/vp8dx.h ] && CODECS="${CODECS} vp8_decoder" [ -f ${source_path}/../include/vpx/vp9cx.h ] && CODECS="${CODECS} vp9_encoder" [ -f ${source_path}/../include/vpx/vp9dx.h ] && CODECS="${CODECS} vp9_decoder" -[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable vp8_encoder -[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable vp8_decoder -[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable vp9_encoder -[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable vp9_decoder +[ -f ${source_path}/../include/vpx/vp8cx.h ] || disable_feature vp8_encoder +[ -f ${source_path}/../include/vpx/vp8dx.h ] || disable_feature vp8_decoder +[ -f ${source_path}/../include/vpx/vp9cx.h ] || disable_feature vp9_encoder +[ -f ${source_path}/../include/vpx/vp9dx.h ] || disable_feature vp9_decoder [ -f ${source_path}/../lib/*/*mt.lib ] && soft_enable static_msvcrt fi @@ -279,6 +280,7 @@ CONFIG_LIST=" dc_recon runtime_cpu_detect postproc + vp9_postproc multithread internal_stats ${CODECS} @@ -314,6 +316,7 @@ CMDLINE_SELECT=" gprof gcov pic + use_x86inc optimizations ccache runtime_cpu_detect @@ -332,6 +335,7 @@ CMDLINE_SELECT=" dequant_tokens dc_recon postproc + vp9_postproc multithread internal_stats ${CODECS} @@ -357,12 +361,12 @@ process_cmdline() { for opt do optval="${opt#*=}" case "$opt" in - --disable-codecs) for c in ${CODECS}; do disable $c; done ;; + --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;; --enable-?*|--disable-?*) eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'` if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then if enabled experimental; then - $action $option + ${action}_feature $option else log_echo "Ignoring $opt -- not in experimental mode." fi @@ -383,8 +387,8 @@ post_process_cmdline() { # If the codec family is enabled, enable all components of that family. log_echo "Configuring selected codecs" for c in ${CODECS}; do - disabled ${c%%_*} && disable ${c} - enabled ${c%%_*} && enable ${c} + disabled ${c%%_*} && disable_feature ${c} + enabled ${c%%_*} && enable_feature ${c} done # Enable all detected codecs, if they haven't been disabled @@ -392,12 +396,12 @@ post_process_cmdline() { # Enable the codec family if any component of that family is enabled for c in ${CODECS}; do - enabled $c && enable ${c%_*} + enabled $c && enable_feature ${c%_*} done # Set the {en,de}coders variable if any algorithm in that class is enabled for c in ${CODECS}; do - enabled ${c} && enable ${c##*_}s + enabled ${c} && enable_feature ${c##*_}s done } @@ -437,7 +441,7 @@ process_targets() { done enabled debug_libs && DIST_DIR="${DIST_DIR}-debug" enabled codec_srcs && DIST_DIR="${DIST_DIR}-src" - ! enabled postproc && DIST_DIR="${DIST_DIR}-nopost" + ! enabled postproc && ! enabled vp9_postproc && DIST_DIR="${DIST_DIR}-nopost" ! enabled multithread && DIST_DIR="${DIST_DIR}-nomt" ! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs" DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}" @@ -507,13 +511,13 @@ process_detect() { fi if [ -z "$CC" ] || enabled external_build; then echo "Bypassing toolchain for environment detection." 
- enable external_build + enable_feature external_build check_header() { log fake_check_header "$@" header=$1 shift var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'` - disable $var + disable_feature $var # Headers common to all environments case $header in stdio.h) @@ -525,7 +529,7 @@ process_detect() { [ -f "${d##-I}/$header" ] && result=true && break done ${result:-true} - esac && enable $var + esac && enable_feature $var # Specialize windows and POSIX environments. case $toolchain in @@ -533,7 +537,7 @@ process_detect() { case $header-$toolchain in stdint*-gcc) true;; *) false;; - esac && enable $var + esac && enable_feature $var ;; *) case $header in @@ -542,7 +546,7 @@ process_detect() { sys/mman.h) true;; unistd.h) true;; *) false;; - esac && enable $var + esac && enable_feature $var esac enabled $var } @@ -560,7 +564,7 @@ EOF check_header sys/mman.h check_header unistd.h # for sysconf(3) and friends. - check_header vpx/vpx_integer.h -I${source_path} && enable vpx_ports + check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports } process_toolchain() { @@ -642,14 +646,18 @@ process_toolchain() { # ccache only really works on gcc toolchains enabled gcc || soft_disable ccache if enabled mips; then - enable dequant_tokens - enable dc_recon + enable_feature dequant_tokens + enable_feature dc_recon + fi + + if enabled internal_stats; then + enable_feature vp9_postproc fi # Enable the postbuild target if building for visual studio. case "$tgt_cc" in - vs*) enable msvs - enable solution + vs*) enable_feature msvs + enable_feature solution vs_version=${tgt_cc##vs} case $vs_version in [789]) diff --git a/libvpx/examples.mk b/libvpx/examples.mk index 5b5ca23..c17fac9 100644 --- a/libvpx/examples.mk +++ b/libvpx/examples.mk @@ -49,6 +49,9 @@ vpxenc.DESCRIPTION = Full featured encoder UTILS-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder +UTILS-$(CONFIG_VP8_ENCODER) += vp9_spatial_scalable_encoder.c +vp8_scalable_patterns.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D +vp8_scalable_patterns.DESCRIPTION = Spatial Scalable Encoder # Clean up old ivfenc, ivfdec binaries. ifeq ($(CONFIG_MSVS),yes) diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h index cd33d12..de94186 100644 --- a/libvpx/test/acm_random.h +++ b/libvpx/test/acm_random.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef LIBVPX_TEST_ACM_RANDOM_H_ -#define LIBVPX_TEST_ACM_RANDOM_H_ +#ifndef TEST_ACM_RANDOM_H_ +#define TEST_ACM_RANDOM_H_ #include "third_party/googletest/src/include/gtest/gtest.h" @@ -59,4 +59,4 @@ class ACMRandom { } // namespace libvpx_test -#endif // LIBVPX_TEST_ACM_RANDOM_H_ +#endif // TEST_ACM_RANDOM_H_ diff --git a/libvpx/test/borders_test.cc b/libvpx/test/borders_test.cc index 7bfece8..dcdedcf 100644 --- a/libvpx/test/borders_test.cc +++ b/libvpx/test/borders_test.cc @@ -29,8 +29,8 @@ class BordersTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if ( video->frame() == 1) { - encoder->Control(VP8E_SET_CPUUSED, 0); + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, 1); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); diff --git a/libvpx/test/clear_system_state.h b/libvpx/test/clear_system_state.h index e240981..8f08a4c 100644 --- a/libvpx/test/clear_system_state.h +++ b/libvpx/test/clear_system_state.h @@ -10,7 +10,7 @@ #ifndef TEST_CLEAR_SYSTEM_STATE_H_ #define TEST_CLEAR_SYSTEM_STATE_H_ -#include "vpx_config.h" +#include "./vpx_config.h" extern "C" { #if ARCH_X86 || ARCH_X86_64 # include "vpx_ports/x86.h" diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index b1510c6..3100571 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <string.h> #include "test/acm_random.h" #include "test/register_state_check.h" #include "test/util.h" @@ -187,7 +188,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) { protected: static const int kDataAlignment = 16; - static const int kOuterBlockSize = 128; + static const int kOuterBlockSize = 256; static const int kInputStride = kOuterBlockSize; static const int kOutputStride = kOuterBlockSize; static const int kMaxDimension = 64; @@ -224,6 +225,10 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) { input_[i] = prng.Rand8Extremes(); } + void SetConstantInput(int value) { + memset(input_, value, kInputBufferSize); + } + void CheckGuardBlocks() { for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) @@ -456,45 +461,86 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = { { 128} }; +/* This test exercises the horizontal and vertical filter functions. */ TEST_P(ConvolveTest, ChangeFilterWorks) { uint8_t* const in = input(); uint8_t* const out = output(); + + /* Assume that the first input sample is at the 8/16th position. */ + const int kInitialSubPelOffset = 8; + + /* Filters are 8-tap, so the first filter tap will be applied to the pixel + * at position -3 with respect to the current filtering position. Since + * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8, + * which is non-zero only in the last tap. So, applying the filter at the + * current input position will result in an output equal to the pixel at + * offset +4 (-3 + 7) with respect to the current filtering position. + */ const int kPixelSelected = 4; + /* Assume that each output pixel requires us to step on by 17/16th pixels in + * the input. + */ + const int kInputPixelStep = 17; + + /* The filters are setup in such a way that the expected output produces + * sets of 8 identical output samples. 
As the filter position moves to the + * next 1/16th pixel position the only active (=128) filter tap moves one + * position to the left, resulting in the same input pixel being replicated + * in to the output for 8 consecutive samples. After each set of 8 positions + * the filters select a different input pixel. kFilterPeriodAdjust below + * computes which input pixel is written to the output for a specified + * x or y position. + */ + + /* Test the horizontal filter. */ REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride, - kChangeFilters[8], 17, kChangeFilters[4], 16, - Width(), Height())); + kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, NULL, 0, Width(), Height())); for (int x = 0; x < Width(); ++x) { - const int kQ4StepAdjust = x >> 4; const int kFilterPeriodAdjust = (x >> 3) << 3; - const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected; - ASSERT_EQ(in[ref_x], out[x]) << "x == " << x; + const int ref_x = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjust * kInputPixelStep) + >> SUBPEL_BITS); + ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width(); } + /* Test the vertical filter. */ REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride, - kChangeFilters[4], 16, kChangeFilters[8], 17, - Width(), Height())); + NULL, 0, kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, Width(), Height())); for (int y = 0; y < Height(); ++y) { - const int kQ4StepAdjust = y >> 4; const int kFilterPeriodAdjust = (y >> 3) << 3; - const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected; + const int ref_y = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjust * kInputPixelStep) + >> SUBPEL_BITS); ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y; } + /* Test the horizontal and vertical filters in combination. */ REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride, - kChangeFilters[8], 17, kChangeFilters[8], 17, + kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, + kChangeFilters[kInitialSubPelOffset], + kInputPixelStep, Width(), Height())); for (int y = 0; y < Height(); ++y) { - const int kQ4StepAdjustY = y >> 4; const int kFilterPeriodAdjustY = (y >> 3) << 3; - const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected; + const int ref_y = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjustY * kInputPixelStep) + >> SUBPEL_BITS); for (int x = 0; x < Width(); ++x) { - const int kQ4StepAdjustX = x >> 4; const int kFilterPeriodAdjustX = (x >> 3) << 3; - const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected; + const int ref_x = + kPixelSelected + ((kInitialSubPelOffset + + kFilterPeriodAdjustX * kInputPixelStep) + >> SUBPEL_BITS); ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x]) << "x == " << x << ", y == " << y; @@ -502,6 +548,34 @@ TEST_P(ConvolveTest, ChangeFilterWorks) { } } +/* This test exercises that enough rows and columns are filtered with every + possible initial fractional positions and scaling steps. */ +TEST_P(ConvolveTest, CheckScalingFiltering) { + uint8_t* const in = input(); + uint8_t* const out = output(); + + SetConstantInput(127); + + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. 
*/ + REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride, + vp9_sub_pel_filters_8[frac], step, + vp9_sub_pel_filters_8[frac], step, + Width(), Height())); + + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x]) + << "x == " << x << ", y == " << y + << ", frac == " << frac << ", step == " << step; + } + } + } + } +} using std::tr1::make_tuple; diff --git a/libvpx/test/cpu_speed_test.cc b/libvpx/test/cpu_speed_test.cc index e6ad75b..c92e723 100644 --- a/libvpx/test/cpu_speed_test.cc +++ b/libvpx/test/cpu_speed_test.cc @@ -108,5 +108,5 @@ using std::tr1::make_tuple; VP9_INSTANTIATE_TEST_CASE( CpuSpeedTest, ::testing::Values(::libvpx_test::kTwoPassGood), - ::testing::Range(0, 3)); + ::testing::Range(0, 5)); } // namespace diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc index 287e805..f020a99 100644 --- a/libvpx/test/datarate_test.cc +++ b/libvpx/test/datarate_test.cc @@ -75,7 +75,7 @@ class DatarateTest : public ::libvpx_test::EncoderTest, bits_in_buffer_model_ -= frame_size_in_bits; // Update the running total of bits for end of test datarate checks. - bits_total_ += frame_size_in_bits ; + bits_total_ += frame_size_in_bits; // If first drop not set and we have a drop set it to this time. if (!first_drop_ && duration > 1) diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 0795054..7d49c12 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -13,15 +13,16 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" -#include "vpx_ports/mem.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" extern "C" { #include "vp9/common/vp9_entropy.h" -#include "vp9_rtcd.h" -void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch); +#include "./vp9_rtcd.h" +void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *output, int pitch); } - -#include "acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -31,12 +32,13 @@ namespace { #ifdef _MSC_VER static int round(double x) { if (x < 0) - return (int)ceil(x - 0.5); + return static_cast<int>(ceil(x - 0.5)); else - return (int)floor(x + 0.5); + return static_cast<int>(floor(x + 0.5)); } #endif +const int kNumCoeffs = 256; const double PI = 3.1415926535898; void reference2_16x16_idct_2d(double *input, double *output) { double x; @@ -45,7 +47,9 @@ void reference2_16x16_idct_2d(double *input, double *output) { double s = 0; for (int i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256; + x = cos(PI * j * (l + 0.5) / 16.0) * + cos(PI * i * (k + 0.5) / 16.0) * + input[i * 16 + j] / 256; if (i != 0) x *= sqrt(2.0); if (j != 0) @@ -59,23 +63,23 @@ void reference2_16x16_idct_2d(double *input, double *output) { } -static const double C1 = 0.995184726672197; -static const double C2 = 0.98078528040323; -static const double C3 = 0.956940335732209; -static const double C4 = 0.923879532511287; -static const double C5 = 0.881921264348355; -static const double C6 = 0.831469612302545; -static const double C7 = 0.773010453362737; -static const double C8 = 0.707106781186548; -static const double C9 = 0.634393284163646; -static const double C10 = 0.555570233019602; -static const double C11 = 0.471396736825998; -static const double C12 = 0.38268343236509; -static const 
double C13 = 0.290284677254462; -static const double C14 = 0.195090322016128; -static const double C15 = 0.098017140329561; - -static void butterfly_16x16_dct_1d(double input[16], double output[16]) { +const double C1 = 0.995184726672197; +const double C2 = 0.98078528040323; +const double C3 = 0.956940335732209; +const double C4 = 0.923879532511287; +const double C5 = 0.881921264348355; +const double C6 = 0.831469612302545; +const double C7 = 0.773010453362737; +const double C8 = 0.707106781186548; +const double C9 = 0.634393284163646; +const double C10 = 0.555570233019602; +const double C11 = 0.471396736825998; +const double C12 = 0.38268343236509; +const double C13 = 0.290284677254462; +const double C14 = 0.195090322016128; +const double C15 = 0.098017140329561; + +void butterfly_16x16_dct_1d(double input[16], double output[16]) { double step[16]; double intermediate[16]; double temp1, temp2; @@ -108,36 +112,36 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { output[6] = step[1] - step[6]; output[7] = step[0] - step[7]; - temp1 = step[ 8]*C7; - temp2 = step[15]*C9; + temp1 = step[ 8] * C7; + temp2 = step[15] * C9; output[ 8] = temp1 + temp2; - temp1 = step[ 9]*C11; - temp2 = step[14]*C5; + temp1 = step[ 9] * C11; + temp2 = step[14] * C5; output[ 9] = temp1 - temp2; - temp1 = step[10]*C3; - temp2 = step[13]*C13; + temp1 = step[10] * C3; + temp2 = step[13] * C13; output[10] = temp1 + temp2; - temp1 = step[11]*C15; - temp2 = step[12]*C1; + temp1 = step[11] * C15; + temp2 = step[12] * C1; output[11] = temp1 - temp2; - temp1 = step[11]*C1; - temp2 = step[12]*C15; + temp1 = step[11] * C1; + temp2 = step[12] * C15; output[12] = temp2 + temp1; - temp1 = step[10]*C13; - temp2 = step[13]*C3; + temp1 = step[10] * C13; + temp2 = step[13] * C3; output[13] = temp2 - temp1; - temp1 = step[ 9]*C5; - temp2 = step[14]*C11; + temp1 = step[ 9] * C5; + temp2 = step[14] * C11; output[14] = temp2 + temp1; - temp1 = step[ 8]*C9; - temp2 = step[15]*C7; + temp1 = step[ 8] * C9; + temp2 = step[15] * C7; output[15] = temp2 - temp1; // step 3 @@ -146,20 +150,20 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { step[ 2] = output[1] - output[2]; step[ 3] = output[0] - output[3]; - temp1 = output[4]*C14; - temp2 = output[7]*C2; + temp1 = output[4] * C14; + temp2 = output[7] * C2; step[ 4] = temp1 + temp2; - temp1 = output[5]*C10; - temp2 = output[6]*C6; + temp1 = output[5] * C10; + temp2 = output[6] * C6; step[ 5] = temp1 + temp2; - temp1 = output[5]*C6; - temp2 = output[6]*C10; + temp1 = output[5] * C6; + temp2 = output[6] * C10; step[ 6] = temp2 - temp1; - temp1 = output[4]*C2; - temp2 = output[7]*C14; + temp1 = output[4] * C2; + temp2 = output[7] * C14; step[ 7] = temp2 - temp1; step[ 8] = output[ 8] + output[11]; @@ -176,18 +180,18 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { output[ 0] = (step[ 0] + step[ 1]); output[ 8] = (step[ 0] - step[ 1]); - temp1 = step[2]*C12; - temp2 = step[3]*C4; + temp1 = step[2] * C12; + temp2 = step[3] * C4; temp1 = temp1 + temp2; - output[ 4] = 2*(temp1*C8); + output[ 4] = 2*(temp1 * C8); - temp1 = step[2]*C4; - temp2 = step[3]*C12; + temp1 = step[2] * C4; + temp2 = step[3] * C12; temp1 = temp2 - temp1; - output[12] = 2*(temp1*C8); + output[12] = 2 * (temp1 * C8); - output[ 2] = 2*((step[4] + step[ 5])*C8); - output[14] = 2*((step[7] - step[ 6])*C8); + output[ 2] = 2 * ((step[4] + step[ 5]) * C8); + output[14] = 2 * ((step[7] - step[ 6]) * C8); temp1 = step[4] - step[5]; temp2 = step[6] + step[7]; 
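For reference, the constants C1 through C15 above are cos(k*pi/32) for k = 1..15 (so C8 = cos(8*pi/32) = 1/sqrt(2)), and butterfly_16x16_dct_1d implements a scaled, factorized form of the 16-point DCT-II. Below is a minimal sketch of the direct form, mirroring the reference_16x16_dct_1d helper that the next hunk removes; only <cmath> is assumed:

#include <cmath>

// Direct-form 16-point DCT-II: out[k] is the sum over n of
// in[n] * cos(pi * (2n + 1) * k / 32), with the DC (k == 0) term scaled
// by 1/sqrt(2). The butterfly above computes the same transform with
// additional per-stage scaling.
static void direct_16x16_dct_1d(const double in[16], double out[16]) {
  const double kPi = 3.141592653589793238462643383279502884;
  const double kInvSqrt2 = 0.707106781186547524400844362104;
  for (int k = 0; k < 16; ++k) {
    out[k] = 0.0;
    for (int n = 0; n < 16; ++n)
      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 32.0);
    if (k == 0)
      out[k] *= kInvSqrt2;
  }
}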
@@ -197,17 +201,17 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { intermediate[8] = step[8] + step[14]; intermediate[9] = step[9] + step[15]; - temp1 = intermediate[8]*C12; - temp2 = intermediate[9]*C4; + temp1 = intermediate[8] * C12; + temp2 = intermediate[9] * C4; temp1 = temp1 - temp2; - output[3] = 2*(temp1*C8); + output[3] = 2 * (temp1 * C8); - temp1 = intermediate[8]*C4; - temp2 = intermediate[9]*C12; + temp1 = intermediate[8] * C4; + temp2 = intermediate[9] * C12; temp1 = temp2 + temp1; - output[13] = 2*(temp1*C8); + output[13] = 2 * (temp1 * C8); - output[ 9] = 2*((step[10] + step[11])*C8); + output[ 9] = 2 * ((step[10] + step[11]) * C8); intermediate[11] = step[10] - step[11]; intermediate[12] = step[12] + step[13]; @@ -218,207 +222,300 @@ static void butterfly_16x16_dct_1d(double input[16], double output[16]) { output[15] = (intermediate[11] + intermediate[12]); output[ 1] = -(intermediate[11] - intermediate[12]); - output[ 7] = 2*(intermediate[13]*C8); + output[ 7] = 2 * (intermediate[13] * C8); - temp1 = intermediate[14]*C12; - temp2 = intermediate[15]*C4; + temp1 = intermediate[14] * C12; + temp2 = intermediate[15] * C4; temp1 = temp1 - temp2; - output[11] = -2*(temp1*C8); + output[11] = -2 * (temp1 * C8); - temp1 = intermediate[14]*C4; - temp2 = intermediate[15]*C12; + temp1 = intermediate[14] * C4; + temp2 = intermediate[15] * C12; temp1 = temp2 + temp1; - output[ 5] = 2*(temp1*C8); + output[ 5] = 2 * (temp1 * C8); } -static void reference_16x16_dct_1d(double in[16], double out[16]) { - const double kPi = 3.141592653589793238462643383279502884; - const double kInvSqrt2 = 0.707106781186547524400844362104; - for (int k = 0; k < 16; k++) { - out[k] = 0.0; - for (int n = 0; n < 16; n++) - out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0); - if (k == 0) - out[k] = out[k]*kInvSqrt2; - } -} - -void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) { +void reference_16x16_dct_2d(int16_t input[256], double output[256]) { // First transform columns for (int i = 0; i < 16; ++i) { double temp_in[16], temp_out[16]; for (int j = 0; j < 16; ++j) - temp_in[j] = input[j*16 + i]; + temp_in[j] = input[j * 16 + i]; butterfly_16x16_dct_1d(temp_in, temp_out); for (int j = 0; j < 16; ++j) - output[j*16 + i] = temp_out[j]; + output[j * 16 + i] = temp_out[j]; } // Then transform rows for (int i = 0; i < 16; ++i) { double temp_in[16], temp_out[16]; for (int j = 0; j < 16; ++j) - temp_in[j] = output[j + i*16]; + temp_in[j] = output[j + i * 16]; butterfly_16x16_dct_1d(temp_in, temp_out); // Scale by some magic number for (int j = 0; j < 16; ++j) - output[j + i*16] = temp_out[j]/2; + output[j + i * 16] = temp_out[j]/2; } } -void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, - int stride, int /*tx_type*/) { +typedef void (*fdct_t)(int16_t *in, int16_t *out, int stride); +typedef void (*idct_t)(int16_t *in, uint8_t *out, int stride); +typedef void (*fht_t) (int16_t *in, int16_t *out, int stride, int tx_type); +typedef void (*iht_t) (int16_t *in, uint8_t *dst, int stride, int tx_type); + +void fdct16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { vp9_short_fdct16x16_c(in, out, stride); } -void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, - int stride, int /*tx_type*/) { - vp9_short_idct16x16_add_c(out, dst, stride >> 1); -} -void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, - int stride, int tx_type) { - // FIXME(jingning): need to test both SSE2 and c -#if HAVE_SSE2 - vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type); 
-#else - vp9_short_fht16x16_c(in, out, stride >> 1, tx_type); -#endif -} -void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, - int stride, int tx_type) { - vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type); + +void fht16x16_ref(int16_t *in, int16_t *out, int stride, int tx_type) { + vp9_short_fht16x16_c(in, out, stride, tx_type); } -class FwdTrans16x16Test : public ::testing::TestWithParam<int> { +class Trans16x16TestBase { public: - virtual ~FwdTrans16x16Test() {} + virtual ~Trans16x16TestBase() {} - virtual void SetUp() { - tx_type_ = GetParam(); - if (tx_type_ == 0) { - fwd_txfm = fdct16x16; - inv_txfm = idct16x16_add; - } else { - fwd_txfm = fht16x16; - inv_txfm = iht16x16_add; + protected: + virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0; + + virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0; + + void RunAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); + + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; + } + + REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, + test_temp_block, pitch_)); + REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); + + for (int j = 0; j < kNumCoeffs; ++j) { + const uint32_t diff = dst[j] - src[j]; + const uint32_t error = diff * diff; + if (max_error < error) + max_error = error; + total_error += error; + } } - } - protected: - void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, - int stride, int tx_type) { - (*fwd_txfm)(in, out, dst, stride, tx_type); - } - void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, - int stride, int tx_type) { - (*inv_txfm)(in, out, dst, stride, tx_type); + EXPECT_GE(1u, max_error) + << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; + + EXPECT_GE(count_test_block , total_error) + << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; } - int tx_type_; - void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int); - void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int); -}; + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); -TEST_P(FwdTrans16x16Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - int max_error = 0; - double total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256); - DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); - DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256); - - for (int j = 0; j < 256; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); + for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. 
- test_input_block[j] = src[j] - dst[j]; + for (int j = 0; j < kNumCoeffs; ++j) + input_block[j] = rnd.Rand8() - rnd.Rand8(); + + fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); + REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_EQ(output_block[j], output_ref_block[j]); } + } - const int pitch = 32; - RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); - RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); - for (int j = 0; j < 256; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - if (max_error < error) - max_error = error; - total_error += error; + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + input_block[j] = rnd.Rand8() - rnd.Rand8(); + input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; + } + if (i == 0) + for (int j = 0; j < kNumCoeffs; ++j) + input_extreme_block[j] = 255; + if (i == 1) + for (int j = 0; j < kNumCoeffs; ++j) + input_extreme_block[j] = -255; + + fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); + REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, + output_block, pitch_)); + + // The minimum quant value is 4. + for (int j = 0; j < kNumCoeffs; ++j) { + EXPECT_EQ(output_block[j], output_ref_block[j]); + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) + << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; + } } } - EXPECT_GE(1, max_error) - << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; + void RunInvAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); - EXPECT_GE(count_test_block , total_error) - << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; -} + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; -TEST_P(FwdTrans16x16Test, CoeffSizeCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256); - DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256); - DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); - - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 256; ++j) { - input_block[j] = rnd.Rand8() - rnd.Rand8(); - input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; - } - if (i == 0) - for (int j = 0; j < 256; ++j) - input_extreme_block[j] = 255; - - const int pitch = 32; - RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_); - RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_); - - // The minimum quant value is 4. 
- for (int j = 0; j < 256; ++j) { - EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) - << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j])) - << "Error: 16x16 FDCT extreme has coefficient larger " - << "than 4*DCT_MAX_VALUE"; + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; + } + + reference_16x16_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) + coeff[j] = round(out_r[j]); + + const int pitch = 32; + REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch)); + + for (int j = 0; j < kNumCoeffs; ++j) { + const uint32_t diff = dst[j] - src[j]; + const uint32_t error = diff * diff; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error + << " at index " << j; + } } } + int pitch_; + int tx_type_; + fht_t fwd_txfm_ref; +}; + +class Trans16x16DCT : public Trans16x16TestBase, + public PARAMS(fdct_t, idct_t, int) { + public: + virtual ~Trans16x16DCT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 32; + fwd_txfm_ref = fdct16x16_ref; + } + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, int stride) { + fwd_txfm_(in, out, stride); + } + void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride >> 1); + } + + fdct_t fwd_txfm_; + idct_t inv_txfm_; +}; + +TEST_P(Trans16x16DCT, AccuracyCheck) { + RunAccuracyCheck(); } -INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4)); +TEST_P(Trans16x16DCT, CoeffCheck) { + RunCoeffCheck(); +} -TEST(VP9Idct16x16Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t in[256], coeff[256]; - uint8_t dst[256], src[256]; - double out_r[256]; +TEST_P(Trans16x16DCT, MemCheck) { + RunMemCheck(); +} - for (int j = 0; j < 256; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. 
- for (int j = 0; j < 256; ++j) - in[j] = src[j] - dst[j]; - - reference_16x16_dct_2d(in, out_r); - for (int j = 0; j < 256; j++) - coeff[j] = round(out_r[j]); - vp9_short_idct16x16_add_c(coeff, dst, 16); - for (int j = 0; j < 256; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - EXPECT_GE(1, error) - << "Error: 16x16 IDCT has error " << error - << " at index " << j; - } +TEST_P(Trans16x16DCT, InvAccuracyCheck) { + RunInvAccuracyCheck(); +} + +class Trans16x16HT : public Trans16x16TestBase, + public PARAMS(fht_t, iht_t, int) { + public: + virtual ~Trans16x16HT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + tx_type_ = GET_PARAM(2); + pitch_ = 16; + fwd_txfm_ref = fht16x16_ref; + } + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, int stride) { + fwd_txfm_(in, out, stride, tx_type_); + } + void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) { + inv_txfm_(out, dst, stride, tx_type_); } + + fht_t fwd_txfm_; + iht_t inv_txfm_; +}; + +TEST_P(Trans16x16HT, AccuracyCheck) { + RunAccuracyCheck(); +} + +TEST_P(Trans16x16HT, CoeffCheck) { + RunCoeffCheck(); +} + +TEST_P(Trans16x16HT, MemCheck) { + RunMemCheck(); } +using std::tr1::make_tuple; + +INSTANTIATE_TEST_CASE_P( + C, Trans16x16DCT, + ::testing::Values( + make_tuple(&vp9_short_fdct16x16_c, &vp9_short_idct16x16_add_c, 0))); +INSTANTIATE_TEST_CASE_P( + C, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 0), + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 1), + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 2), + make_tuple(&vp9_short_fht16x16_c, &vp9_short_iht16x16_add_c, 3))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, Trans16x16DCT, + ::testing::Values( + make_tuple(&vp9_short_fdct16x16_sse2, &vp9_short_idct16x16_add_c, 0))); +INSTANTIATE_TEST_CASE_P( + SSE2, Trans16x16HT, + ::testing::Values( + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 0), + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 1), + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 2), + make_tuple(&vp9_short_fht16x16_sse2, &vp9_short_iht16x16_add_sse2, 3))); +#endif } // namespace diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc index e05d482..f331886 100644 --- a/libvpx/test/dct32x32_test.cc +++ b/libvpx/test/dct32x32_test.cc @@ -13,15 +13,17 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" extern "C" { +#include "./vpx_config.h" #include "vp9/common/vp9_entropy.h" #include "./vp9_rtcd.h" - void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch); - void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch); } -#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -30,35 +32,15 @@ namespace { #ifdef _MSC_VER static int round(double x) { if (x < 0) - return (int)ceil(x - 0.5); + return static_cast<int>(ceil(x - 0.5)); else - return (int)floor(x + 0.5); + return static_cast<int>(floor(x + 0.5)); } #endif -static const double kPi = 3.141592653589793238462643383279502884; -static void reference2_32x32_idct_2d(double *input, double *output) { - double x; - for (int l = 0; l < 32; ++l) { - for (int k = 0; k < 32; ++k) { - double s = 0; - for (int 
i = 0; i < 32; ++i) { - for (int j = 0; j < 32; ++j) { - x = cos(kPi * j * (l + 0.5) / 32.0) * - cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024; - if (i != 0) - x *= sqrt(2.0); - if (j != 0) - x *= sqrt(2.0); - s += x; - } - } - output[k * 32 + l] = s / 4; - } - } -} - -static void reference_32x32_dct_1d(double in[32], double out[32], int stride) { +const int kNumCoeffs = 1024; +const double kPi = 3.141592653589793238462643383279502884; +void reference_32x32_dct_1d(const double in[32], double out[32], int stride) { const double kInvSqrt2 = 0.707106781186547524400844362104; for (int k = 0; k < 32; k++) { out[k] = 0.0; @@ -69,7 +51,8 @@ static void reference_32x32_dct_1d(double in[32], double out[32], int stride) { } } -static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) { +void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], + double output[kNumCoeffs]) { // First transform columns for (int i = 0; i < 32; ++i) { double temp_in[32], temp_out[32]; @@ -91,102 +74,189 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) { } } -TEST(VP9Idct32x32Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t in[1024], coeff[1024]; - uint8_t dst[1024], src[1024]; - double out_r[1024]; +typedef void (*fwd_txfm_t)(int16_t *in, int16_t *out, int stride); +typedef void (*inv_txfm_t)(int16_t *in, uint8_t *dst, int stride); - for (int j = 0; j < 1024; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 1024; ++j) - in[j] = src[j] - dst[j]; - - reference_32x32_dct_2d(in, out_r); - for (int j = 0; j < 1024; j++) - coeff[j] = round(out_r[j]); - vp9_short_idct32x32_add_c(coeff, dst, 32); - for (int j = 0; j < 1024; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - EXPECT_GE(1, error) - << "Error: 32x32 IDCT has error " << error - << " at index " << j; - } +class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) { + public: + virtual ~Trans32x32Test() {} + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + version_ = GET_PARAM(2); // 0: high precision forward transform + // 1: low precision version for rd loop } -} -TEST(VP9Fdct32x32Test, AccuracyCheck) { + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + int version_; + fwd_txfm_t fwd_txfm_; + inv_txfm_t inv_txfm_; +}; + +TEST_P(Trans32x32Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - unsigned int max_error = 0; + uint32_t max_error = 0; int64_t total_error = 0; const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[1024]; - int16_t test_temp_block[1024]; - uint8_t dst[1024], src[1024]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); - for (int j = 0; j < 1024; ++j) { + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < kNumCoeffs; ++j) { src[j] = rnd.Rand8(); dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. 
- for (int j = 0; j < 1024; ++j) test_input_block[j] = src[j] - dst[j]; + } const int pitch = 64; - vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch); - vp9_short_idct32x32_add_c(test_temp_block, dst, 32); + REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch)); + REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32)); - for (int j = 0; j < 1024; ++j) { - const unsigned diff = dst[j] - src[j]; - const unsigned error = diff * diff; + for (int j = 0; j < kNumCoeffs; ++j) { + const uint32_t diff = dst[j] - src[j]; + const uint32_t error = diff * diff; if (max_error < error) max_error = error; total_error += error; } } + if (version_ == 1) { + max_error /= 2; + total_error /= 45; + } + EXPECT_GE(1u, max_error) - << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1"; + << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1"; EXPECT_GE(count_test_block, total_error) - << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block"; + << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block"; } -TEST(VP9Fdct32x32Test, CoeffSizeCheck) { +TEST_P(Trans32x32Test, CoeffCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; + + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); + for (int i = 0; i < count_test_block; ++i) { - int16_t input_block[1024], input_extreme_block[1024]; - int16_t output_block[1024], output_extreme_block[1024]; + for (int j = 0; j < kNumCoeffs; ++j) + input_block[j] = rnd.Rand8() - rnd.Rand8(); + + const int pitch = 64; + vp9_short_fdct32x32_c(input_block, output_ref_block, pitch); + REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch)); + + if (version_ == 0) { + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_EQ(output_block[j], output_ref_block[j]) + << "Error: 32x32 FDCT versions have mismatched coefficients"; + } else { + for (int j = 0; j < kNumCoeffs; ++j) + EXPECT_GE(6, abs(output_block[j] - output_ref_block[j])) + << "Error: 32x32 FDCT rd has mismatched coefficients"; + } + } +} + +TEST_P(Trans32x32Test, MemCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 2000; + + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs); + for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 1024; ++j) { + for (int j = 0; j < kNumCoeffs; ++j) { input_block[j] = rnd.Rand8() - rnd.Rand8(); - input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255; + input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255; } if (i == 0) - for (int j = 0; j < 1024; ++j) + for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = 255; + if (i == 1) + for (int j = 0; j < kNumCoeffs; ++j) + input_extreme_block[j] = -255; const int pitch = 64; - vp9_short_fdct32x32_c(input_block, output_block, pitch); - vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch); + vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch); + REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch)); // The minimum quant value is 4. 
- for (int j = 0; j < 1024; ++j) { - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j])) - << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j])) - << "Error: 32x32 FDCT extreme has coefficient larger than " - "4*DCT_MAX_VALUE"; + for (int j = 0; j < kNumCoeffs; ++j) { + if (version_ == 0) { + EXPECT_EQ(output_block[j], output_ref_block[j]) + << "Error: 32x32 FDCT versions have mismatched coefficients"; + } else { + EXPECT_GE(6, abs(output_block[j] - output_ref_block[j])) + << "Error: 32x32 FDCT rd has mismatched coefficients"; + } + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j])) + << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE"; + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) + << "Error: 32x32 FDCT has coefficient larger than " + << "4*DCT_MAX_VALUE"; } } } + +TEST_P(Trans32x32Test, InverseAccuracy) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs); + + for (int i = 0; i < count_test_block; ++i) { + double out_r[kNumCoeffs]; + + // Initialize a test block with input range [-255, 255] + for (int j = 0; j < kNumCoeffs; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; + } + + reference_32x32_dct_2d(in, out_r); + for (int j = 0; j < kNumCoeffs; ++j) + coeff[j] = round(out_r[j]); + REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32)); + for (int j = 0; j < kNumCoeffs; ++j) { + const int diff = dst[j] - src[j]; + const int error = diff * diff; + EXPECT_GE(1, error) + << "Error: 32x32 IDCT has error " << error + << " at index " << j; + } + } +} + +using std::tr1::make_tuple; + +INSTANTIATE_TEST_CASE_P( + C, Trans32x32Test, + ::testing::Values( + make_tuple(&vp9_short_fdct32x32_c, &vp9_short_idct32x32_add_c, 0), + make_tuple(&vp9_short_fdct32x32_rd_c, &vp9_short_idct32x32_add_c, 1))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, Trans32x32Test, + ::testing::Values( + make_tuple(&vp9_short_fdct32x32_sse2, + &vp9_short_idct32x32_add_sse2, 0), + make_tuple(&vp9_short_fdct32x32_rd_sse2, + &vp9_short_idct32x32_add_sse2, 1))); +#endif } // namespace diff --git a/libvpx/test/decode_test_driver.h b/libvpx/test/decode_test_driver.h index 49e7384..055c45e 100644 --- a/libvpx/test/decode_test_driver.h +++ b/libvpx/test/decode_test_driver.h @@ -12,7 +12,7 @@ #define TEST_DECODE_TEST_DRIVER_H_ #include <cstring> #include "third_party/googletest/src/include/gtest/gtest.h" -#include "vpx_config.h" +#include "./vpx_config.h" #include "vpx/vpx_decoder.h" namespace libvpx_test { @@ -36,9 +36,8 @@ class DxDataIterator { }; // Provides a simplified interface to manage one video decoding. -// -// TODO: similar to Encoder class, the exact services should be -// added as more tests are added. +// Similar to Encoder class, the exact services should be added +// as more tests are added. class Decoder { public: Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline) diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc index eed3e33..709831e 100644 --- a/libvpx/test/encode_test_driver.cc +++ b/libvpx/test/encode_test_driver.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_config.h" +#include "./vpx_config.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/decode_test_driver.h" @@ -114,19 +114,19 @@ static bool compare_img(const vpx_image_t *img1, const unsigned int height_y = img1->d_h; unsigned int i; for (i = 0; i < height_y; ++i) - match = ( memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - width_y) == 0) && match; + match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + width_y) == 0) && match; const unsigned int width_uv = (img1->d_w + 1) >> 1; const unsigned int height_uv = (img1->d_h + 1) >> 1; for (i = 0; i < height_uv; ++i) - match = ( memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - width_uv) == 0) && match; + match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + width_uv) == 0) && match; for (i = 0; i < height_uv; ++i) - match = ( memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - width_uv) == 0) && match; + match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + width_uv) == 0) && match; return match; } @@ -158,7 +158,7 @@ void EncoderTest::RunLoop(VideoSource *video) { Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0); bool again; for (again = true, video->Begin(); again; video->Next()) { - again = video->img() != NULL; + again = (video->img() != NULL); PreEncodeFrameHook(video); PreEncodeFrameHook(video, encoder); diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc index d4a6967..16d250c 100644 --- a/libvpx/test/error_resilience_test.cc +++ b/libvpx/test/error_resilience_test.cc @@ -62,7 +62,7 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest, if (droppable_nframes_ > 0 && (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { for (unsigned int i = 0; i < droppable_nframes_; ++i) { - if (droppable_frames_[i] == nframes_) { + if (droppable_frames_[i] == video->frame()) { std::cout << " Encoding droppable frame: " << droppable_frames_[i] << "\n"; frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST | @@ -148,7 +148,7 @@ TEST_P(ErrorResilienceTest, OnVersusOff) { const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; cfg_.rc_target_bitrate = 2000; - cfg_.g_lag_in_frames = 25; + cfg_.g_lag_in_frames = 10; init_flags_ = VPX_CODEC_USE_PSNR; @@ -179,6 +179,9 @@ TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) { const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; cfg_.rc_target_bitrate = 500; + // FIXME(debargha): Fix this to work for any lag. 
+ // Currently this test only works for lag = 0 + cfg_.g_lag_in_frames = 0; init_flags_ = VPX_CODEC_USE_PSNR; diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc index 9dcc078..ea40ca6 100644 --- a/libvpx/test/fdct4x4_test.cc +++ b/libvpx/test/fdct4x4_test.cc @@ -15,10 +15,10 @@ #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -136,7 +136,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; - double total_error = 0; + int total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16); @@ -156,7 +156,7 @@ TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) { RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 16; ++j) { - if(test_temp_block[j] > 0) { + if (test_temp_block[j] > 0) { test_temp_block[j] += 2; test_temp_block[j] /= 4; test_temp_block[j] *= 4; diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc index 50e2e9d..ee6c9f6 100644 --- a/libvpx/test/fdct8x8_test.cc +++ b/libvpx/test/fdct8x8_test.cc @@ -13,14 +13,16 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" #include "vpx_ports/mem.h" extern "C" { -#include "vp9_rtcd.h" -void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch); +#include "./vp9_rtcd.h" +void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *output, int pitch); } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -62,6 +64,7 @@ class FwdTrans8x8Test : public ::testing::TestWithParam<int> { inv_txfm = iht8x8_add; } } + virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, @@ -92,8 +95,9 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 64; ++j) test_input_block[j] = rnd.Rand8() - rnd.Rand8(); - - RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_output_block, + NULL, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -121,8 +125,9 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) { // Initialize a test block with input range [-15, 15]. 
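// Editor's note: Rand8() yields a value in [0, 255], so (Rand8() >> 4) is
// in [0, 15] and the difference of two such draws below spans exactly
// [-15, 15], e.g. (255 >> 4) - (0 >> 4) == 15 and (0 >> 4) - (255 >> 4) == -15.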
for (int j = 0; j < 64; ++j) test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); - - RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_output_block, + NULL, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -148,7 +153,7 @@ TEST_P(FwdTrans8x8Test, SignBiasCheck) { TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; - double total_error = 0; + int total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); @@ -165,9 +170,11 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); - for (int j = 0; j < 64; ++j){ - if(test_temp_block[j] > 0) { + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); + for (int j = 0; j < 64; ++j) { + if (test_temp_block[j] > 0) { test_temp_block[j] += 2; test_temp_block[j] /= 4; test_temp_block[j] *= 4; @@ -177,7 +184,9 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { test_temp_block[j] *= 4; } } - RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunInvTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; @@ -199,7 +208,7 @@ TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { TEST_P(FwdTrans8x8Test, ExtremalCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; - double total_error = 0; + int total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); @@ -216,8 +225,12 @@ TEST_P(FwdTrans8x8Test, ExtremalCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); - RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); + REGISTER_STATE_CHECK( + RunInvTxfm(test_input_block, test_temp_block, + dst, pitch, tx_type_)); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; diff --git a/libvpx/test/i420_video_source.h b/libvpx/test/i420_video_source.h index bcbe8a7..2bf2a03 100644 --- a/libvpx/test/i420_video_source.h +++ b/libvpx/test/i420_video_source.h @@ -11,6 +11,7 @@ #define TEST_I420_VIDEO_SOURCE_H_ #include <cstdio> #include <cstdlib> +#include <string> #include "test/video_source.h" @@ -34,7 +35,6 @@ class I420VideoSource : public VideoSource { height_(0), framerate_numerator_(rate_numerator), framerate_denominator_(rate_denominator) { - // This initializes raw_sz_, width_, height_ and allocates an img. 
SetSize(width, height); } diff --git a/libvpx/test/idct8x8_test.cc b/libvpx/test/idct8x8_test.cc index 67db78b..fc8129e 100644 --- a/libvpx/test/idct8x8_test.cc +++ b/libvpx/test/idct8x8_test.cc @@ -15,10 +15,10 @@ #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vp9_rtcd.h" +#include "./vp9_rtcd.h" } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -27,10 +27,10 @@ namespace { #ifdef _MSC_VER static int round(double x) { - if(x < 0) - return (int)ceil(x - 0.5); + if (x < 0) + return static_cast<int>(ceil(x - 0.5)); else - return (int)floor(x + 0.5); + return static_cast<int>(floor(x + 0.5)); } #endif diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc index aa786cb..2c7fa0e 100644 --- a/libvpx/test/idct_test.cc +++ b/libvpx/test/idct_test.cc @@ -16,7 +16,9 @@ extern "C" { #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" -typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr, +#include "vpx/vpx_integer.h" + +typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); namespace { @@ -34,7 +36,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { virtual void TearDown() { libvpx_test::ClearSystemState(); } idct_fn_t UUT; - short input[16]; + int16_t input[16]; unsigned char output[256]; unsigned char predict[256]; }; diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc index da96741..f5f6d5b 100644 --- a/libvpx/test/intrapred_test.cc +++ b/libvpx/test/intrapred_test.cc @@ -15,8 +15,8 @@ #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vp8/common/blockd.h" #include "vpx_mem/vpx_mem.h" } @@ -106,9 +106,9 @@ class IntraPredBase { for (int y = 0; y < block_size_; y++) sum += data_ptr_[p][y * stride_ - 1]; expected = (sum + (1 << (shift - 1))) >> shift; - } else + } else { expected = 0x80; - + } // check that all subsequent lines are equal to the first for (int y = 1; y < block_size_; ++y) ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_], diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h index 926f801..3fbafbd 100644 --- a/libvpx/test/ivf_video_source.h +++ b/libvpx/test/ivf_video_source.h @@ -28,7 +28,7 @@ static unsigned int MemGetLe32(const uint8_t *mem) { // so that we can do actual file decodes. class IVFVideoSource : public CompressedVideoSource { public: - IVFVideoSource(const std::string &file_name) + explicit IVFVideoSource(const std::string &file_name) : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL), diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc index f7572e8..7ee2898 100644 --- a/libvpx/test/keyframe_test.cc +++ b/libvpx/test/keyframe_test.cc @@ -132,7 +132,6 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { // Verify that keyframes match the file keyframes in the file. 
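// Editor's note (assumption): on this clip the encoder auto-places
// keyframes on a fixed 30-frame cadence in realtime mode, so every
// recorded keyframe pts after the initial frame should satisfy
// (*iter - 1) % 30 == 0, which is what the loop below asserts.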
for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { - if (deadline_ == VPX_DL_REALTIME && *iter > 0) EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " << *iter; diff --git a/libvpx/test/md5_helper.h b/libvpx/test/md5_helper.h index fc1a974..289f608 100644 --- a/libvpx/test/md5_helper.h +++ b/libvpx/test/md5_helper.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef LIBVPX_TEST_MD5_HELPER_H_ -#define LIBVPX_TEST_MD5_HELPER_H_ +#ifndef TEST_MD5_HELPER_H_ +#define TEST_MD5_HELPER_H_ extern "C" { #include "./md5_utils.h" @@ -25,9 +25,15 @@ class MD5 { void Add(const vpx_image_t *img) { for (int plane = 0; plane < 3; ++plane) { - uint8_t *buf = img->planes[plane]; - const int h = plane ? (img->d_h + 1) >> 1 : img->d_h; - const int w = plane ? (img->d_w + 1) >> 1 : img->d_w; + const uint8_t *buf = img->planes[plane]; + // Calculate the width and height to do the md5 check. For the chroma + // plane, we never want to round down and thus skip a pixel so if + // we are shifting by 1 (chroma_shift) we add 1 before doing the shift. + // This works only for chroma_shift of 0 and 1. + const int h = plane ? (img->d_h + img->y_chroma_shift) >> + img->y_chroma_shift : img->d_h; + const int w = plane ? (img->d_w + img->x_chroma_shift) >> + img->x_chroma_shift : img->d_w; for (int y = 0; y < h; ++y) { MD5Update(&md5_, buf, w); @@ -61,4 +67,4 @@ class MD5 { } // namespace libvpx_test -#endif // LIBVPX_TEST_MD5_HELPER_H_ +#endif // TEST_MD5_HELPER_H_ diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc index 79896fe..e5ac9db 100644 --- a/libvpx/test/pp_filter_test.cc +++ b/libvpx/test/pp_filter_test.cc @@ -11,8 +11,8 @@ #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" extern "C" { -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" } @@ -63,7 +63,8 @@ TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) { // Pointers to top-left pixel of block in the input and output images. uint8_t *const src_image_ptr = src_image + (input_stride << 1); uint8_t *const dst_image_ptr = dst_image + 8; - uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); + uint8_t *const flimits = + reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); (void)vpx_memset(flimits, 255, block_width); // Initialize pixels in the input: diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h index fb3f53b..479a42d 100644 --- a/libvpx/test/register_state_check.h +++ b/libvpx/test/register_state_check.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef LIBVPX_TEST_REGISTER_STATE_CHECK_H_ -#define LIBVPX_TEST_REGISTER_STATE_CHECK_H_ +#ifndef TEST_REGISTER_STATE_CHECK_H_ +#define TEST_REGISTER_STATE_CHECK_H_ #ifdef _WIN64 @@ -92,4 +92,4 @@ class RegisterStateCheck {}; #endif // _WIN64 -#endif // LIBVPX_TEST_REGISTER_STATE_CHECK_H_ +#endif // TEST_REGISTER_STATE_CHECK_H_ diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index 7412a24..77d3f5c 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -124,6 +124,13 @@ TEST_P(ResizeInternalTest, TestInternalResizeWorks) { ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, 30, 1, 0, 10); init_flags_ = VPX_CODEC_USE_PSNR; + + // If the number of frames being encoded is smaller than g_lag_in_frames + // the encoded frame is unavailable using the current API. Comparing + // frames to detect mismatch would then not be possible. Set + // g_lag_in_frames = 0 to get around this. + cfg_.g_lag_in_frames = 0; + // q picked such that initial keyframe on this clip is ~30dB PSNR cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index bf3e0b8..453b3a8 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -17,7 +17,6 @@ extern "C" { #include "./vpx_config.h" #if CONFIG_VP8_ENCODER #include "./vp8_rtcd.h" -//#include "vp8/common/blockd.h" #endif #if CONFIG_VP9_ENCODER #include "./vp9_rtcd.h" @@ -428,6 +427,7 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #if HAVE_SSE #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse; INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values( @@ -441,6 +441,7 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( make_tuple(4, 4, sad_4x4x4d_sse))); #endif #endif +#endif #if HAVE_SSE2 #if CONFIG_VP8_ENCODER @@ -451,6 +452,7 @@ const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt; const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; #endif #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; @@ -463,6 +465,7 @@ const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; #endif +#endif const sad_m_by_n_test_param_t sse2_tests[] = { #if CONFIG_VP8_ENCODER make_tuple(16, 16, sad_16x16_wmt), @@ -472,6 +475,7 @@ const sad_m_by_n_test_param_t sse2_tests[] = { make_tuple(4, 4, sad_4x4_wmt), #endif #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC make_tuple(64, 64, sad_64x64_sse2_vp9), make_tuple(64, 32, sad_64x32_sse2_vp9), make_tuple(32, 64, sad_32x64_sse2_vp9), @@ -484,10 +488,12 @@ const sad_m_by_n_test_param_t sse2_tests[] = { make_tuple(8, 8, sad_8x8_sse2_vp9), make_tuple(8, 4, sad_8x4_sse2_vp9), #endif +#endif }; INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); #if CONFIG_VP9_ENCODER +#if CONFIG_USE_X86INC const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; @@ -513,6 +519,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( make_tuple(8, 4, sad_8x4x4d_sse2))); #endif #endif 
+#endif #if HAVE_SSE3 #if CONFIG_VP8_ENCODER @@ -531,9 +538,11 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( #endif #if HAVE_SSSE3 +#if CONFIG_USE_X86INC const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_sse3))); #endif +#endif } // namespace diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc index 3b6112e..9d2e771 100644 --- a/libvpx/test/set_roi.cc +++ b/libvpx/test/set_roi.cc @@ -17,15 +17,19 @@ #include <sys/types.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" extern "C" { #include "vp8/encoder/onyx_int.h" } +using libvpx_test::ACMRandom; + namespace { TEST(Vp8RoiMapTest, ParameterCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 }; int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 }; unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 }; @@ -121,10 +125,10 @@ TEST(Vp8RoiMapTest, ParameterCheck) { for (int i = 0; i < 1000; ++i) { int rand_deltas[4]; int deltas_valid; - rand_deltas[0] = (rand() % 160) - 80; - rand_deltas[1] = (rand() % 160) - 80; - rand_deltas[2] = (rand() % 160) - 80; - rand_deltas[3] = (rand() % 160) - 80; + rand_deltas[0] = rnd(160) - 80; + rand_deltas[1] = rnd(160) - 80; + rand_deltas[2] = rnd(160) - 80; + rand_deltas[3] = rnd(160) - 80; deltas_valid = ((abs(rand_deltas[0]) <= 63) && (abs(rand_deltas[1]) <= 63) && diff --git a/libvpx/test/subtract_test.cc b/libvpx/test/subtract_test.cc index 574bfbf..d1f2729 100644 --- a/libvpx/test/subtract_test.cc +++ b/libvpx/test/subtract_test.cc @@ -13,8 +13,8 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" extern "C" { -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vp8/common/blockd.h" #include "vp8/encoder/block.h" #include "vpx_mem/vpx_mem.h" @@ -51,7 +51,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) { bd.predictor = reinterpret_cast<unsigned char*>( vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor))); - for(int i = 0; kSrcStride[i] > 0; ++i) { + for (int i = 0; kSrcStride[i] > 0; ++i) { // start at block0 be.src = 0; be.base_src = &source; diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1 index 0ac4905..370ffc1 100644 --- a/libvpx/test/test-data.sha1 +++ b/libvpx/test/test-data.sha1 @@ -520,3 +520,7 @@ d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm 83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5 fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm 94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5 +495256cfd123fe777b2c0406862ed8468a1f4677 vp91-2-04-yv444.webm +65e3a7ffef61ab340d9140f335ecc49125970c2c vp91-2-04-yv444.webm.md5 +b6524e4084d15b5d0caaa3d3d1368db30cbee69c vp90-2-03-deltaq.webm +65f45ec9a55537aac76104818278e0978f94a678 vp90-2-03-deltaq.webm.md5 diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index 25e05b9..a64c0b8 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -24,7 +24,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc 
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc @@ -629,3 +629,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-deltaq.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5 diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc index 5610c26..a4dbca4 100644 --- a/libvpx/test/test_libvpx.cc +++ b/libvpx/test/test_libvpx.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <string> -#include "vpx_config.h" +#include "./vpx_config.h" extern "C" { #if ARCH_X86 || ARCH_X86_64 #include "vpx_ports/x86.h" @@ -48,7 +48,9 @@ int main(int argc, char **argv) { #endif #if !CONFIG_SHARED - /* Shared library builds don't support whitebox tests that exercise internal symbols. */ +// Shared library builds don't support whitebox tests +// that exercise internal symbols. + #if CONFIG_VP8 vp8_rtcd(); #endif diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc index 9b0e9d5..9bd03b9 100644 --- a/libvpx/test/test_vector_test.cc +++ b/libvpx/test/test_vector_test.cc @@ -159,7 +159,10 @@ const char *kVP9TestVectors[] = { "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm", "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm", "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm", - "vp90-2-03-size-226x226.webm" + "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm", +#if CONFIG_NON420 + "vp91-2-04-yv444.webm" +#endif }; #endif diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index 207b6e7..ca53ffb 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -16,16 +16,16 @@ #include "test/register_state_check.h" #include "vpx/vpx_integer.h" -#include "vpx_config.h" +#include "./vpx_config.h" extern "C" { #include "vpx_mem/vpx_mem.h" #if CONFIG_VP8_ENCODER # include "vp8/common/variance.h" -# include "vp8_rtcd.h" +# include "./vp8_rtcd.h" #endif #if CONFIG_VP9_ENCODER # include "vp9/encoder/vp9_variance.h" -# include "vp9_rtcd.h" +# include "./vp9_rtcd.h" #endif } #include "test/acm_random.h" @@ -107,8 +107,8 @@ static unsigned int subpel_avg_variance_ref(const uint8_t *ref, } template<typename VarianceFunctionType> -class VarianceTest : - public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { +class VarianceTest + : public ::testing::TestWithParam<tuple<int, int, VarianceFunctionType> > { public: virtual void SetUp() { const tuple<int, int, VarianceFunctionType>& params = this->GetParam(); @@ -191,9 +191,9 @@ void VarianceTest<VarianceFunctionType>::OneQuarterTest() { } template<typename SubpelVarianceFunctionType> -class SubpelVarianceTest : - public ::testing::TestWithParam<tuple<int, int, - SubpelVarianceFunctionType> > { +class SubpelVarianceTest + : public ::testing::TestWithParam<tuple<int, int, + SubpelVarianceFunctionType> > { public: virtual void SetUp() { const tuple<int, int, SubpelVarianceFunctionType>& params = @@ -483,6 +483,7 @@ INSTANTIATE_TEST_CASE_P( #endif 
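// Editor's note: as in sad_test.cc above, the SSE2/SSSE3 instantiations
// that follow are additionally fenced with CONFIG_USE_X86INC, because the
// vp9_*_sse2/ssse3 functions they reference are built from x86inc.asm-based
// assembly that can be compiled out at configure time. The gating pattern:
//   #if HAVE_SSE2
//   #if CONFIG_USE_X86INC
//     INSTANTIATE_TEST_CASE_P(SSE2, ...);
//   #endif
//   #endif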
#if HAVE_SSE2 +#if CONFIG_USE_X86INC const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; @@ -596,8 +597,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_sse2), make_tuple(6, 6, subpel_avg_variance64x64_sse2))); #endif +#endif #if HAVE_SSSE3 +#if CONFIG_USE_X86INC + const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 = vp9_sub_pixel_variance4x4_ssse3; const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 = @@ -682,6 +686,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_ssse3), make_tuple(6, 6, subpel_avg_variance64x64_ssse3))); #endif +#endif #endif // CONFIG_VP9_ENCODER } // namespace vp9 diff --git a/libvpx/test/vp8_boolcoder_test.cc b/libvpx/test/vp8_boolcoder_test.cc index c3a8d12..0383af2 100644 --- a/libvpx/test/vp8_boolcoder_test.cc +++ b/libvpx/test/vp8_boolcoder_test.cc @@ -8,10 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -extern "C" { -#include "vp8/encoder/boolhuff.h" -#include "vp8/decoder/dboolhuff.h" -} #include <math.h> #include <stddef.h> @@ -24,6 +20,11 @@ extern "C" { #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" +extern "C" { +#include "vp8/encoder/boolhuff.h" +#include "vp8/decoder/dboolhuff.h" +} + namespace { const int num_tests = 10; @@ -44,7 +45,7 @@ void encrypt_buffer(uint8_t *buffer, int size) { void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, int count) { - int offset = input - (uint8_t *)decrypt_state; + int offset = input - reinterpret_cast<uint8_t *>(decrypt_state); for (int i = 0; i < count; i++) { output[i] = input[i] ^ secret_key[(offset + i) & 15]; } @@ -58,10 +59,10 @@ TEST(VP8, TestBitIO) { ACMRandom rnd(ACMRandom::DeterministicSeed()); for (int n = 0; n < num_tests; ++n) { for (int method = 0; method <= 7; ++method) { // we generate various proba - const int bits_to_test = 1000; - uint8_t probas[bits_to_test]; + const int kBitsToTest = 1000; + uint8_t probas[kBitsToTest]; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { const int parity = i & 1; probas[i] = (method == 0) ? 0 : (method == 1) ? 255 : @@ -76,14 +77,14 @@ TEST(VP8, TestBitIO) { } for (int bit_method = 0; bit_method <= 3; ++bit_method) { const int random_seed = 6432; - const int buffer_size = 10000; + const int kBufferSize = 10000; ACMRandom bit_rnd(random_seed); BOOL_CODER bw; - uint8_t bw_buffer[buffer_size]; - vp8_start_encode(&bw, bw_buffer, bw_buffer + buffer_size); + uint8_t bw_buffer[kBufferSize]; + vp8_start_encode(&bw, bw_buffer, bw_buffer + kBufferSize); int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 
1 : 0; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { @@ -98,19 +99,20 @@ TEST(VP8, TestBitIO) { #if CONFIG_DECRYPT encrypt_buffer(bw_buffer, buffer_size); vp8dx_start_decode(&br, bw_buffer, buffer_size, - test_decrypt_cb, (void *)bw_buffer); + test_decrypt_cb, + reinterpret_cast<void *>(bw_buffer)); #else - vp8dx_start_decode(&br, bw_buffer, buffer_size, NULL, NULL); + vp8dx_start_decode(&br, bw_buffer, kBufferSize, NULL, NULL); #endif bit_rnd.Reset(random_seed); - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { bit = bit_rnd(2); } GTEST_ASSERT_EQ(vp8dx_decode_bool(&br, probas[i]), bit) - << "pos: "<< i << " / " << bits_to_test + << "pos: "<< i << " / " << kBitsToTest << " bit_method: " << bit_method << " method: " << method; } diff --git a/libvpx/test/vp8_decrypt_test.cc b/libvpx/test/vp8_decrypt_test.cc index d850f00..b092509 100644 --- a/libvpx/test/vp8_decrypt_test.cc +++ b/libvpx/test/vp8_decrypt_test.cc @@ -26,7 +26,8 @@ const uint8_t test_key[16] = { 0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0 }; -void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) { +void encrypt_buffer(const uint8_t *src, uint8_t *dst, + int size, int offset = 0) { for (int i = 0; i < size; ++i) { dst[i] = src[i] ^ test_key[(offset + i) & 15]; } @@ -34,10 +35,11 @@ void encrypt_buffer(const uint8_t *src, uint8_t *dst, int size, int offset = 0) void test_decrypt_cb(void *decrypt_state, const uint8_t *input, uint8_t *output, int count) { - encrypt_buffer(input, output, count, input - (uint8_t *)decrypt_state); + encrypt_buffer(input, output, count, + input - reinterpret_cast<uint8_t *>(decrypt_state)); } -} // namespace +} // namespace namespace libvpx_test { diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc index 3c60011..c823436 100644 --- a/libvpx/test/vp8_fdct4x4_test.cc +++ b/libvpx/test/vp8_fdct4x4_test.cc @@ -18,7 +18,7 @@ extern "C" { -#include "vp8_rtcd.h" +#include "./vp8_rtcd.h" } #include "test/acm_random.h" diff --git a/libvpx/test/vp9_boolcoder_test.cc b/libvpx/test/vp9_boolcoder_test.cc index 42b2229..5edde90 100644 --- a/libvpx/test/vp9_boolcoder_test.cc +++ b/libvpx/test/vp9_boolcoder_test.cc @@ -19,7 +19,7 @@ extern "C" { #include "vp9/decoder/vp9_dboolhuff.h" } -#include "acm_random.h" +#include "test/acm_random.h" #include "vpx/vpx_integer.h" using libvpx_test::ACMRandom; @@ -32,10 +32,10 @@ TEST(VP9, TestBitIO) { ACMRandom rnd(ACMRandom::DeterministicSeed()); for (int n = 0; n < num_tests; ++n) { for (int method = 0; method <= 7; ++method) { // we generate various proba - const int bits_to_test = 1000; - uint8_t probas[bits_to_test]; + const int kBitsToTest = 1000; + uint8_t probas[kBitsToTest]; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { const int parity = i & 1; probas[i] = (method == 0) ? 0 : (method == 1) ? 255 : @@ -50,14 +50,14 @@ TEST(VP9, TestBitIO) { } for (int bit_method = 0; bit_method <= 3; ++bit_method) { const int random_seed = 6432; - const int buffer_size = 10000; + const int kBufferSize = 10000; ACMRandom bit_rnd(random_seed); vp9_writer bw; - uint8_t bw_buffer[buffer_size]; + uint8_t bw_buffer[kBufferSize]; vp9_start_encode(&bw, bw_buffer); int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 
1 : 0; - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { @@ -72,16 +72,16 @@ TEST(VP9, TestBitIO) { GTEST_ASSERT_EQ(bw_buffer[0] & 0x80, 0); vp9_reader br; - vp9_reader_init(&br, bw_buffer, buffer_size); + vp9_reader_init(&br, bw_buffer, kBufferSize); bit_rnd.Reset(random_seed); - for (int i = 0; i < bits_to_test; ++i) { + for (int i = 0; i < kBitsToTest; ++i) { if (bit_method == 2) { bit = (i & 1); } else if (bit_method == 3) { bit = bit_rnd(2); } GTEST_ASSERT_EQ(vp9_read(&br, probas[i]), bit) - << "pos: " << i << " / " << bits_to_test + << "pos: " << i << " / " << kBitsToTest << " bit_method: " << bit_method << " method: " << method; } diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc index 2476795..332a839 100644 --- a/libvpx/test/vp9_subtract_test.cc +++ b/libvpx/test/vp9_subtract_test.cc @@ -39,8 +39,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) { ACMRandom rnd(ACMRandom::DeterministicSeed()); // FIXME(rbultje) split in its own file - for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES; - bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) { + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) { const int block_width = 4 << b_width_log2(bsize); const int block_height = 4 << b_height_log2(bsize); int16_t *diff = reinterpret_cast<int16_t *>( diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h index 766b4ea..30c4cbb 100644 --- a/libvpx/vp8/common/onyx.h +++ b/libvpx/vp8/common/onyx.h @@ -41,7 +41,8 @@ extern "C" { USAGE_STREAM_FROM_SERVER = 0x0, USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2 + USAGE_CONSTRAINED_QUALITY = 0x2, + USAGE_CONSTANT_QUALITY = 0x3 } END_USAGE; diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c index 841e1e4..250d04c 100644 --- a/libvpx/vp8/encoder/picklpf.c +++ b/libvpx/vp8/encoder/picklpf.c @@ -313,7 +313,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) /* Get baseline error score */ /* Copy the unfiltered / processed recon buffer to the new buffer */ - vp8_yv12_copy_y(saved_frame, cm->frame_to_show); + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); vp8cx_set_alt_lf_level(cpi, filt_mid); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid); @@ -339,7 +339,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) if(ss_err[filt_low] == 0) { /* Get Low filter error score */ - vp8_yv12_copy_y(saved_frame, cm->frame_to_show); + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); vp8cx_set_alt_lf_level(cpi, filt_low); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low); @@ -367,7 +367,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { if(ss_err[filt_high] == 0) { - vp8_yv12_copy_y(saved_frame, cm->frame_to_show); + vpx_yv12_copy_y(saved_frame, cm->frame_to_show); vp8cx_set_alt_lf_level(cpi, filt_high); vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high); diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index 9a7b9c5..19e9d27 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -153,7 +153,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, #else RANGE_CHECK_HI(cfg, g_lag_in_frames, 25); #endif - RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ); + RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); 
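/* Editor's note: VPX_Q widens the rc_end_usage range accepted by the check
 * above; later hunks in this file map it onto the end-usage value newly
 * added to the enum in onyx.h:
 *   VPX_VBR -> USAGE_LOCAL_FILE_PLAYBACK
 *   VPX_CBR -> USAGE_STREAM_FROM_SERVER
 *   VPX_CQ  -> USAGE_CONSTRAINED_QUALITY
 *   VPX_Q   -> USAGE_CONSTANT_QUALITY
 */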
RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); @@ -204,7 +204,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); RANGE_CHECK(vp8_cfg, cq_level, 0, 63); - if(finalize && cfg->rc_end_usage == VPX_CQ) + if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q)) RANGE_CHECK(vp8_cfg, cq_level, cfg->rc_min_quantizer, cfg->rc_max_quantizer); @@ -327,17 +327,14 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->resample_up_water_mark = cfg.rc_resize_up_thresh; oxcf->resample_down_water_mark = cfg.rc_resize_down_thresh; - if (cfg.rc_end_usage == VPX_VBR) - { - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; - } - else if (cfg.rc_end_usage == VPX_CBR) - { - oxcf->end_usage = USAGE_STREAM_FROM_SERVER; - } - else if (cfg.rc_end_usage == VPX_CQ) - { - oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + if (cfg.rc_end_usage == VPX_VBR) { + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + } else if (cfg.rc_end_usage == VPX_CBR) { + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + } else if (cfg.rc_end_usage == VPX_CQ) { + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + } else if (cfg.rc_end_usage == VPX_Q) { + oxcf->end_usage = USAGE_CONSTANT_QUALITY; } oxcf->target_bandwidth = cfg.rc_target_bitrate; @@ -1272,7 +1269,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif - + VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ 1, /* ts_number_layers */ {0}, /* ts_target_bitrate */ {0}, /* ts_rate_decimator */ diff --git a/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm new file mode 100644 index 0000000..7d24530 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm @@ -0,0 +1,116 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_convolve_avg_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_convolve_avg_neon| PROC + push {r4-r6, lr} + ldrd r4, r5, [sp, #32] + mov r6, r2 + + cmp r4, #32 + bgt avg64 + beq avg32 + cmp r4, #8 + bgt avg16 + beq avg8 + b avg4 + +avg64 + sub lr, r1, #32 + sub r4, r3, #32 +avg64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + pld [r2, r3] + vld1.8 {q8-q9}, [r6@128]! + vld1.8 {q10-q11}, [r6@128], r4 + vrhadd.u8 q0, q0, q8 + vrhadd.u8 q1, q1, q9 + vrhadd.u8 q2, q2, q10 + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128]! 
+ vst1.8 {q2-q3}, [r2@128], r4 + subs r5, r5, #1 + bgt avg64_h + pop {r4-r6, pc} + +avg32 + vld1.8 {q0-q1}, [r0], r1 + vld1.8 {q2-q3}, [r0], r1 + vld1.8 {q8-q9}, [r6@128], r3 + vld1.8 {q10-q11}, [r6@128], r3 + pld [r0] + vrhadd.u8 q0, q0, q8 + pld [r0, r1] + vrhadd.u8 q1, q1, q9 + pld [r6] + vrhadd.u8 q2, q2, q10 + pld [r6, r3] + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt avg32 + pop {r4-r6, pc} + +avg16 + vld1.8 {q0}, [r0], r1 + vld1.8 {q1}, [r0], r1 + vld1.8 {q2}, [r6@128], r3 + vld1.8 {q3}, [r6@128], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q2 + pld [r6] + pld [r6, r3] + vrhadd.u8 q1, q1, q3 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt avg16 + pop {r4-r6, pc} + +avg8 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r6@64], r3 + vld1.8 {d3}, [r6@64], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q1 + pld [r6] + pld [r6, r3] + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d1}, [r2@64], r3 + subs r5, r5, #2 + bgt avg8 + pop {r4-r6, pc} + +avg4 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[0]}, [r6@32], r3 + vld1.32 {d2[1]}, [r6@32], r3 + vrhadd.u8 d0, d0, d2 + vst1.32 {d0[0]}, [r2@32], r3 + vst1.32 {d0[1]}, [r2@32], r3 + subs r5, r5, #2 + bgt avg4 + pop {r4-r6, pc} + ENDP + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm index 110a56c..6b20cb9 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -66,46 +66,64 @@ vld1.s16 {q0}, [r5] ; filter_x - add r8, r1, r1, lsl #1 ; src_stride * 3 - add r8, r8, #4 ; src_stride * 3 + 4 - rsb r8, r8, #0 ; reset for src + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 - add r4, r3, r3, lsl #1 ; dst_stride * 3 - sub r4, r4, #4 ; dst_stride * 3 - 4 - rsb r4, r4, #0 ; reset for dst + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 - sub r9, r1, #8 ; post increment for src load - - rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop mov r10, r6 ; w loop counter -loop_horiz - vld1.8 {d24}, [r0]! - vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - - vld1.8 {d25}, [r0]! - vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - - vld1.8 {d26}, [r0]! - vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - - vld1.8 {d27}, [r0]! 
- vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 +loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 vtrn.16 q12, q13 vtrn.8 d24, d25 vtrn.8 d26, d27 - ; extract to s16 + pld [r0, r1, lsl #2] + vmovl.u8 q8, d24 vmovl.u8 q9, d25 vmovl.u8 q10, d26 vmovl.u8 q11, d27 - vtrn.32 d28, d29 ; only the first half is populated + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 vmovl.u8 q12, d28 - vmovl.u8 q13, d30 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] ; slightly out of order load to match the existing data vld1.u32 {d6[0]}, [r2], r3 @@ -116,10 +134,12 @@ loop_horiz sub r2, r2, r3, lsl #2 ; reset for store ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 - MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 - MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -135,24 +155,29 @@ loop_horiz vtrn.16 d2, d3 vtrn.32 d2, d3 vtrn.8 d2, d3 - + ; average the new value and the dst value vrhadd.u8 q1, q1, q3 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r4 + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 subs r6, r6, #4 ; w -= 4 bgt loop_horiz ; outer loop mov r6, r10 ; restore w counter - add r0, r0, r1 ; src += src_stride * 4 - w + add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt loop_horiz + bgt loop_horiz_v pop {r4-r10, pc} @@ -163,66 +188,77 @@ loop_horiz cmp r12, #16 bne vp9_convolve8_avg_vert_c - push {r4-r10, lr} + push {r4-r8, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r7, [sp, #40] ; filter_y - ldr r8, [sp, #48] ; w - ldr r9, [sp, #52] ; h + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h - vld1.s16 {q0}, [r7] ; filter_y + vld1.s16 {q0}, [r4] ; filter_y - mov r5, r1, lsl #1 ; src_stride * 2 - add r5, r5, r1, lsl #3 ; src_stride * 10 - sub r5, r5, #4 ; src_stride * 10 + 4 - rsb r5, r5, #0 ; reset for src + lsl r1, r1, #1 + lsl r3, r3, #1 - add r6, r3, r3, lsl #1 ; dst_stride * 3 - sub r6, r6, #4 ; dst_stride * 3 - 4 - rsb r6, r6, #0 ; reset for dst +loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter - rsb r7, r8, r1, lsl #2 ; reset src for outer loop - rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 - mov r10, r8 ; w loop counter + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 
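; Editor's note: the rewritten vertical pass hoists the first seven source
; rows into q8-q11 ahead of the label below and then loads only the four
; new rows per 4x4 iteration. r1/r3 hold doubled strides while r4/r7 (src)
; and r5/r8 (dst) are offset by one original stride, so loads of adjacent
; rows can issue back to back; the vmov block at the end of the loop slides
; this register window down by four rows instead of reloading all eleven.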
loop_vert ; always process a 4x4 block at a time - vld1.u32 {d16[0]}, [r0], r1 - vld1.u32 {d16[1]}, [r0], r1 - vld1.u32 {d18[0]}, [r0], r1 - vld1.u32 {d18[1]}, [r0], r1 - vld1.u32 {d20[0]}, [r0], r1 - vld1.u32 {d20[1]}, [r0], r1 - vld1.u32 {d22[0]}, [r0], r1 - vld1.u32 {d22[1]}, [r0], r1 - vld1.u32 {d24[0]}, [r0], r1 - vld1.u32 {d24[1]}, [r0], r1 - vld1.u32 {d26[0]}, [r0], r5 + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 ; extract to s16 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 vmovl.u8 q12, d24 vmovl.u8 q13, d26 - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 + vld1.u32 {d6[0]}, [r5@32], r3 + vld1.u32 {d6[1]}, [r8@32], r3 + vld1.u32 {d7[0]}, [r5@32], r3 + vld1.u32 {d7[1]}, [r8@32], r3 - sub r2, r2, r3, lsl #2 ; reset for store + pld [r7] + pld [r4] ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r5] + pld [r8] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -237,22 +273,30 @@ loop_vert ; average the new value and the dst value vrhadd.u8 q1, q1, q3 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r6 + sub r5, r5, r3, lsl #1 ; reset for store + sub r8, r8, r3, lsl #1 - subs r8, r8, #4 ; w -= 4 + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 + + subs r12, r12, #4 ; h -= 4 bgt loop_vert ; outer loop - mov r8, r10 ; restore w counter - add r0, r0, r7 ; src += 4 * src_stride - w - add r2, r2, r12 ; dst += 4 * dst_stride - w - subs r9, r9, #4 ; h -= 4 - bgt loop_vert + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt loop_vert_h - pop {r4-r10, pc} + pop {r4-r8, pc} ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm index 845e4a8..4525845 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -66,52 +66,72 @@ vld1.s16 {q0}, [r5] ; filter_x - add r8, r1, r1, lsl #1 ; src_stride * 3 - add r8, r8, #4 ; src_stride * 3 + 4 - rsb r8, r8, #0 ; reset for src + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 - add r4, r3, r3, lsl #1 ; dst_stride * 3 - sub r4, r4, #4 ; dst_stride * 3 - 4 - rsb r4, r4, #0 ; reset for dst + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 - sub r9, r1, #8 ; post increment for src load - - rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 rsb r12, r6, r3, lsl #2 ; reset dst for outer loop mov r10, r6 ; w loop counter -loop_horiz - vld1.8 {d24}, [r0]! - vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - - vld1.8 {d25}, [r0]! 
- vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - - vld1.8 {d26}, [r0]! - vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - - vld1.8 {d27}, [r0]! - vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 +loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 vtrn.16 q12, q13 vtrn.8 d24, d25 vtrn.8 d26, d27 - ; extract to s16 + pld [r0, r1, lsl #2] + vmovl.u8 q8, d24 vmovl.u8 q9, d25 vmovl.u8 q10, d26 vmovl.u8 q11, d27 - vtrn.32 d28, d29 ; only the first half is populated + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 vmovl.u8 q12, d28 - vmovl.u8 q13, d30 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 - MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 - MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -128,20 +148,25 @@ loop_horiz vtrn.32 d2, d3 vtrn.8 d2, d3 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r4 + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 subs r6, r6, #4 ; w -= 4 bgt loop_horiz ; outer loop mov r6, r10 ; restore w counter - add r0, r0, r1 ; src += src_stride * 4 - w + add r0, r0, r9 ; src += src_stride * 4 - w add r2, r2, r12 ; dst += dst_stride * 4 - w subs r7, r7, #4 ; h -= 4 - bgt loop_horiz + bgt loop_horiz_v pop {r4-r10, pc} @@ -152,59 +177,72 @@ loop_horiz cmp r12, #16 bne vp9_convolve8_vert_c - push {r4-r10, lr} + push {r4-r8, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r7, [sp, #40] ; filter_y - ldr r8, [sp, #48] ; w - ldr r9, [sp, #52] ; h + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h - vld1.s16 {q0}, [r7] ; filter_y + vld1.s16 {q0}, [r4] ; filter_y - mov r5, r1, lsl #1 ; src_stride * 2 - add r5, r5, r1, lsl #3 ; src_stride * 10 - sub r5, r5, #4 ; src_stride * 10 + 4 - rsb r5, r5, #0 ; reset for src + lsl r1, r1, #1 + lsl r3, r3, #1 - add r6, r3, r3, lsl #1 ; dst_stride * 3 - sub r6, r6, #4 ; dst_stride * 3 - 4 - rsb r6, r6, #0 ; reset for dst +loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter - rsb r7, r8, r1, lsl #2 ; reset src for outer loop - rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 - mov r10, r8 ; w loop counter + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 loop_vert ; always process a 4x4 block at a time - vld1.u32 {d16[0]}, [r0], r1 - vld1.u32 {d16[1]}, [r0], r1 
- vld1.u32 {d18[0]}, [r0], r1 - vld1.u32 {d18[1]}, [r0], r1 - vld1.u32 {d20[0]}, [r0], r1 - vld1.u32 {d20[1]}, [r0], r1 - vld1.u32 {d22[0]}, [r0], r1 - vld1.u32 {d22[1]}, [r0], r1 - vld1.u32 {d24[0]}, [r0], r1 - vld1.u32 {d24[1]}, [r0], r1 - vld1.u32 {d26[0]}, [r0], r5 + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 ; extract to s16 - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 vmovl.u8 q12, d24 vmovl.u8 q13, d26 + pld [r5] + pld [r8] + ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r7] + pld [r4] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 ; += 64 >> 7 vqrshrun.s32 d2, q1, #7 @@ -216,22 +254,27 @@ loop_vert vqmovn.u16 d2, q1 vqmovn.u16 d3, q2 - vst1.u32 {d2[0]}, [r2], r3 - vst1.u32 {d2[1]}, [r2], r3 - vst1.u32 {d3[0]}, [r2], r3 - vst1.u32 {d3[1]}, [r2], r6 + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 - subs r8, r8, #4 ; w -= 4 + subs r12, r12, #4 ; h -= 4 bgt loop_vert ; outer loop - mov r8, r10 ; restore w counter - add r0, r0, r7 ; src += 4 * src_stride - w - add r2, r2, r12 ; dst += 4 * dst_stride - w - subs r9, r9, #4 ; h -= 4 - bgt loop_vert + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt loop_vert_h - pop {r4-r10, pc} + pop {r4-r8, pc} ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c index 6e37ff6..d8b24bf 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c @@ -10,6 +10,7 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" +#include "vpx_ports/mem.h" void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -19,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ - uint8_t temp[64 * 72]; + DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); // Account for the vertical phase needing 3 lines prior and 4 lines post int intermediate_height = h + 7; @@ -53,7 +54,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - uint8_t temp[64 * 72]; + DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); int intermediate_height = h + 7; if (x_step_q4 != 16 || y_step_q4 != 16) diff --git a/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm b/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm new file mode 100644 index 0000000..a0bd04a --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm @@ -0,0 +1,84 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_convolve_copy_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_convolve_copy_neon| PROC + push {r4-r5, lr} + ldrd r4, r5, [sp, #28] + + cmp r4, #32 + bgt copy64 + beq copy32 + cmp r4, #8 + bgt copy16 + beq copy8 + b copy4 + +copy64 + sub lr, r1, #32 + sub r3, r3, #32 +copy64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #1 + bgt copy64_h + pop {r4-r5, pc} + +copy32 + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q2-q3}, [r0], r1 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt copy32 + pop {r4-r5, pc} + +copy16 + pld [r0, r1, lsl #1] + vld1.8 {q0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q1}, [r0], r1 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt copy16 + pop {r4-r5, pc} + +copy8 + pld [r0, r1, lsl #1] + vld1.8 {d0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {d2}, [r0], r1 + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d2}, [r2@64], r3 + subs r5, r5, #2 + bgt copy8 + pop {r4-r5, pc} + +copy4 + ldr r12, [r0], r1 + str r12, [r2], r3 + subs r5, r5, #1 + bgt copy4 + pop {r4-r5, pc} + ENDP + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c new file mode 100644 index 0000000..3e3e400 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, + int16_t *output, + int output_stride); +extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, + int16_t *output, + int output_stride); +extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +extern void save_neon_registers(); +extern void restore_neon_registers(); + + +void vp9_short_idct16x16_add_neon(int16_t *input, + uint8_t *dest, int dest_stride) { + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + + // save d8-d15 register values. + save_neon_registers(); + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. 
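For orientation, a minimal scalar sketch of the row/column structure that the four pass1/pass2 calls below implement; idct16_1d is a hypothetical stand-in for the NEON pass pair, and clip_pixel mirrors the saturating narrowing done in pass 2:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Hypothetical scalar 16-point inverse transform (declaration only). */
void idct16_1d(const int16_t *in, int16_t *out);

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

static void idct16x16_ref(const int16_t *input, uint8_t *dest,
                          int dest_stride) {
  int16_t rows[16 * 16];
  int i, j;
  for (i = 0; i < 16; ++i)                    /* row pass */
    idct16_1d(input + i * 16, rows + i * 16);
  for (j = 0; j < 16; ++j) {                  /* column pass, then add */
    int16_t col[16], out[16];
    for (i = 0; i < 16; ++i)
      col[i] = rows[i * 16 + j];
    idct16_1d(col, out);
    for (i = 0; i < 16; ++i)
      dest[i * dest_stride + j] = clip_pixel(
          dest[i * dest_stride + j] + ROUND_POWER_OF_TWO(out[i], 6));
  }
}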
+ vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_short_idct16x16_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the lower 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_short_idct16x16_add_neon_pass2(input+8*16+1, + row_idct_output+8, + pass1_output, + 0, + dest, + dest_stride); + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + + // restore d8-d15 register values. + restore_neon_registers(); + + return; +} + +void vp9_short_idct10_16x16_add_neon(int16_t *input, + uint8_t *dest, int dest_stride) { + int16_t pass1_output[16*16] = {0}; + int16_t row_idct_output[16*16] = {0}; + + // save d8-d15 register values. + save_neon_registers(); + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + vp9_short_idct10_16x16_add_neon_pass2(input+1, + row_idct_output, + pass1_output, + 0, + dest, + dest_stride); + + /* Skip Parallel idct on the lower 8 rows as they are all 0s */ + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. 
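A note on the vp9_short_idct10_16x16_add_neon variant above: skipping the lower 8 rows is valid because, when at most the first 10 coefficients in scan order are nonzero, rows 8-15 of the input block are entirely zero, and a linear transform of zero input is zero. A hypothetical guard spelling out that assumption (the real code presumably relies on the caller's eob check):

#include <stdint.h>

/* Returns 1 when rows 8..15 of a 16x16 coefficient block are all zero,
 * in which case the row pass over those rows can be skipped outright. */
static int lower_rows_are_zero(const int16_t *input) {
  int i;
  for (i = 8 * 16; i < 16 * 16; ++i)
    if (input[i]) return 0;
  return 1;
}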
+ vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + row_idct_output, + pass1_output, + 1, + dest, + dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + row_idct_output+8, + pass1_output, + 1, + dest+8, + dest_stride); + + // restore d8-d15 register values. + restore_neon_registers(); + + return; +} diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c new file mode 100644 index 0000000..ceecd6f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" + +// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +extern void idct32_transpose_and_transform(int16_t *transpose_buffer, + int16_t *output, int16_t *input); +extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); + + +// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +extern void save_neon_registers(); +extern void restore_neon_registers(); + +void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, + int dest_stride) { + // TODO(cd): move the creation of these buffers within the ASM file + // internal buffer used to transpose 8 lines into before transforming them + int16_t transpose_buffer[32 * 8]; + // results of the first pass (transpose and transform rows) + int16_t pass1[32 * 32]; + // results of the second pass (transpose and transform columns) + int16_t pass2[32 * 32]; + + // save register we need to preserve + save_neon_registers(); + // process rows + idct32_transpose_and_transform(transpose_buffer, pass1, input); + // process columns + // TODO(cd): do these two steps/passes within the ASM file + idct32_transpose_and_transform(transpose_buffer, pass2, pass1); + // combine and add to dest + // TODO(cd): integrate this within the last storage step of the second pass + idct32_combine_add(dest, pass2, dest_stride); + // restore register we need to preserve + restore_neon_registers(); +} + +// TODO(cd): Eliminate this file altogether when everything is in ASM file diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index edf5786..2e8001b 100644 --- a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -361,8 +361,6 @@ v_end vand d16, d20, d19 ; flat && mask vmov r5, r6, d16 - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) vabd.u8 d22, d3, d7 ; abs(p4 - p0) @@ -388,10 +386,11 @@ v_end vmov.u8 d22, #0x80 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + vand d17, 
d18, d16 ; flat2 && flat && mask vmov r5, r6, d17 - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch ; mbfilter() function @@ -405,15 +404,10 @@ v_end vmov.u8 d27, #3 vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 ; filter = clamp(filter + 3 * ( qs0 - ps0)) @@ -452,37 +446,37 @@ v_end vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 vqrshrn.u16 d18, q15, #3 ; r_op2 - vsubw.u8 q15, d4 ; op1 = op2 - p3 - vsubw.u8 q15, d5 ; op1 -= p2 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d9 ; op1 += q1 + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 vqrshrn.u16 d19, q15, #3 ; r_op1 - vsubw.u8 q15, d4 ; op0 = op1 - p3 - vsubw.u8 q15, d6 ; op0 -= p1 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d10 ; op0 += q2 + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 vqrshrn.u16 d20, q15, #3 ; r_op0 vsubw.u8 q15, d4 ; oq0 = op0 - p3 vsubw.u8 q15, d7 ; oq0 -= p0 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d11 ; oq0 += q3 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 vqrshrn.u16 d21, q15, #3 ; r_oq0 vsubw.u8 q15, d5 ; oq1 = oq0 - p2 vsubw.u8 q15, d8 ; oq1 -= q0 - vaddw.u8 q15, d9 ; oq1 += q1 - vaddw.u8 q15, d11 ; oq1 += q3 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 vqrshrn.u16 d22, q15, #3 ; r_oq1 vsubw.u8 q15, d6 ; oq2 = oq0 - p1 vsubw.u8 q15, d9 ; oq2 -= q1 - vaddw.u8 q15, d10 ; oq2 += q2 - vaddw.u8 q15, d11 ; oq2 += q3 + vadd.i16 q15, q14 vqrshrn.u16 d27, q15, #3 ; r_oq2 ; Filter does not set op2 or oq2, so use p2 and q2. 
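The rewrite above pipelines the flat-branch filter by pre-summing incoming sample pairs with vaddl.u8 while the previous vqrshrn is still in flight. The underlying arithmetic, taken from the asm's own comments, is a running window sum in which each output reuses the previous one via one subtract/add pair; a scalar sketch:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Flat (mbfilter) taps as a sliding window, following the comments above. */
static void mbfilter_ref(int p3, int p2, int p1, int p0,
                         int q0, int q1, int q2, int q3,
                         uint8_t *op2, uint8_t *op1, uint8_t *op0,
                         uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
  int sum = p3 * 3 + p2 * 2 + p1 + p0 + q0;   /* window for op2 */
  *op2 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p1 + q1 - p3 - p2;                   /* slide to op1 */
  *op1 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += p0 + q2 - p3 - p1;                   /* slide to op0 */
  *op0 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q0 + q3 - p3 - p0;                   /* slide to oq0 */
  *oq0 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q1 + q3 - p2 - q0;                   /* slide to oq1 */
  *oq1 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
  sum += q2 + q3 - p1 - q1;                   /* slide to oq2 */
  *oq2 = (uint8_t)ROUND_POWER_OF_TWO(sum, 3);
}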
@@ -501,113 +495,104 @@ v_end ; wide_mbfilter flat2 && flat && mask branch vmov.u8 d16, #7 vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vmlal.u8 q15, d1, d29 ; op6 += p6 * 2 - vaddw.u8 q15, d2 ; op6 += p5 - vaddw.u8 q15, d3 ; op6 += p4 - vaddw.u8 q15, d4 ; op6 += p3 - vaddw.u8 q15, d5 ; op6 += p2 - vaddw.u8 q15, d6 ; op6 += p1 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 vqrshrn.u16 d16, q15, #4 ; w_op6 - vsubw.u8 q15, d0 ; op5 = op6 - p7 - vsubw.u8 q15, d1 ; op5 -= p6 - vaddw.u8 q15, d2 ; op5 += p5 - vaddw.u8 q15, d9 ; op5 += q1 + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 vqrshrn.u16 d24, q15, #4 ; w_op5 - vsubw.u8 q15, d0 ; op4 = op5 - p7 - vsubw.u8 q15, d2 ; op4 -= p5 - vaddw.u8 q15, d3 ; op4 += p4 - vaddw.u8 q15, d10 ; op4 += q2 + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 vqrshrn.u16 d25, q15, #4 ; w_op4 - vsubw.u8 q15, d0 ; op3 = op4 - p7 - vsubw.u8 q15, d3 ; op3 -= p4 - vaddw.u8 q15, d4 ; op3 += p3 - vaddw.u8 q15, d11 ; op3 += q3 + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 vqrshrn.u16 d26, q15, #4 ; w_op3 - vsubw.u8 q15, d0 ; op2 = op3 - p7 - vsubw.u8 q15, d4 ; op2 -= p3 - vaddw.u8 q15, d5 ; op2 += p2 + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) vqrshrn.u16 d27, q15, #4 ; w_op2 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - - vsubw.u8 q15, d0 ; op1 = op2 - p7 - vsubw.u8 q15, d5 ; op1 -= p2 + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 vaddw.u8 q15, d6 ; op1 += p1 vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) vqrshrn.u16 d18, q15, #4 ; w_op1 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - - vsubw.u8 q15, d0 ; op0 = op1 - p7 - vsubw.u8 q15, d6 ; op0 -= p1 + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 vaddw.u8 q15, d7 ; op0 += p0 vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) vqrshrn.u16 d19, q15, #4 ; w_op0 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - - vsubw.u8 q15, d0 ; oq0 = op0 - p7 - vsubw.u8 q15, d7 ; oq0 -= p0 + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 vaddw.u8 q15, d8 ; oq0 += q0 vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) vqrshrn.u16 d20, q15, #4 ; w_oq0 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - - vsubw.u8 q15, d1 ; oq1 = oq0 - p6 - vsubw.u8 q15, d8 ; oq1 -= q0 + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 vaddw.u8 q15, d9 ; oq1 += q1 + vaddl.u8 q4, d10, d15 vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) vqrshrn.u16 d21, q15, #4 ; w_oq1 + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - - vsubw.u8 q15, d2 ; oq2 = oq1 - p5 - vsubw.u8 q15, d9 ; oq2 -= q1 - vaddw.u8 q15, d10 ; oq2 += q2 - vaddw.u8 q15, d15 ; oq2 += q7 vqrshrn.u16 d22, q15, #4 ; w_oq2 + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - - vsubw.u8 q15, d3 ; oq3 = oq2 - p4 - vsubw.u8 q15, d10 ; oq3 -= q2 - vaddw.u8 q15, d11 ; oq3 += q3 - vaddw.u8 q15, d15 ; oq3 += q7 vqrshrn.u16 d23, q15, #4 ; w_oq3 + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 
q4, d13, d15 vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - - vsubw.u8 q15, d4 ; oq4 = oq3 - p3 - vsubw.u8 q15, d11 ; oq4 -= q3 - vaddw.u8 q15, d12 ; oq4 += q4 - vaddw.u8 q15, d15 ; oq4 += q7 vqrshrn.u16 d1, q15, #4 ; w_oq4 + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - - vsubw.u8 q15, d5 ; oq5 = oq4 - p2 - vsubw.u8 q15, d12 ; oq5 -= q4 - vaddw.u8 q15, d13 ; oq5 += q5 - vaddw.u8 q15, d15 ; oq5 += q7 vqrshrn.u16 d2, q15, #4 ; w_oq5 + vsub.i16 q15, q14 vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - - vsubw.u8 q15, d6 ; oq6 = oq5 - p1 - vsubw.u8 q15, d13 ; oq6 -= q5 - vaddw.u8 q15, d14 ; oq6 += q6 - vaddw.u8 q15, d15 ; oq6 += q7 - vqrshrn.u16 d3, q15, #4 ; w_oq6 - - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vadd.i16 q15, q4 vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm new file mode 100644 index 0000000..cf5c8f7 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm @@ -0,0 +1,198 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_short_idct16x16_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct16x16_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asr r0, r0, #6 ; >> 6 + + vdup.s16 q0, r0 ; duplicate a1 + mov r0, #8 + sub r2, #8 + + ; load destination data row0 - row3 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, 
q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row4 - row7 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row8 - row11 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row12 - row15 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_short_idct16x16_1_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm new file mode 100644 index 0000000..7464e80 --- /dev/null +++ 
b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -0,0 +1,1191 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_idct16x16_add_neon_pass1| + EXPORT |vp9_short_idct16x16_add_neon_pass2| + EXPORT |vp9_short_idct10_16x16_add_neon_pass1| + EXPORT |vp9_short_idct10_16x16_add_neon_pass2| + EXPORT |save_neon_registers| + EXPORT |restore_neon_registers| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Transpose an 8x8 16bit data matrix. Data is loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code ;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input ; r1 int16_t *output ; r2 int output_stride) + ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. |vp9_short_idct16x16_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]!
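The overlapping vld2.s16 chain above, completed by the vmov.s16 q15, q1 that follows, is an even-element gather: each vld2 de-interleaves sixteen values into even/odd halves, and reusing the odd-half register as the next load's even half leaves q8-q15 holding only elements 0, 2, ..., 14 of eight consecutive rows. Scalar equivalent:

#include <stdint.h>

/* Gather the even-indexed entries of eight 16-element rows, matching the
 * "load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15" comment. */
static void load_even_columns(const int16_t *input, int16_t rows[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      rows[r][c] = input[r * 16 + 2 * c];
}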
+ vmov.s16 q15, q1 + + ; generate cospi_28_64 = 3196 + mov r3, #0xc00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r12, #0x3e00 + add r12, #0xc5 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r12 ; duplicate cospi_4_64 + + ; preloading to avoid stall + ; generate cospi_12_64 = 13623 + mov r3, #0x3500 + add r3, #0x37 + + ; generate cospi_20_64 = 9102 + mov r12, #0x2300 + add r12, #0x8e + + ; step2[4] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; step2[4] * cospi_4_64 + vmull.s16 q5, d18, d1 + vmull.s16 q6, d19, d1 + + ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 + vmlal.s16 q5, d30, d0 + vmlal.s16 q6, d31, d0 + + vdup.16 d2, r3 ; duplicate cospi_12_64 + vdup.16 d3, r12 ; duplicate cospi_20_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q5, #14 ; >> 14 + vqrshrn.s32 d15, q6, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; generate cospi_24_64 = 6270 + mov r12, #0x1800 + add r12, #0x7e + + ; step2[5] * cospi_12_64 + vmull.s16 q2, d26, d2 + vmull.s16 q3, d27, d2 + + ; step2[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q15, d27, d3 + + ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q2, d22, d3 + vmlsl.s16 q3, d23, d3 + + ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q15, d23, d2 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q15, #14 ; >> 14 + + ; stage 4 + vdup.16 d30, r3 ; cospi_16_64 + + ; step1[0] * cospi_16_64 + vmull.s16 q2, d16, d30 + vmull.s16 q11, d17, d30 + + ; step1[1] * cospi_16_64 + vmull.s16 q0, d24, d30 + vmull.s16 q1, d25, d30 + + ; generate cospi_8_64 = 15137 + mov r3, #0x3b00 + add r3, #0x21 + + vdup.16 d30, r12 ; duplicate cospi_24_64 + vdup.16 d31, r3 ; duplicate cospi_8_64 + + ; temp1 = (step1[0] + step1[1]) * cospi_16_64 + vadd.s32 q3, q2, q0 + vadd.s32 q12, q11, q1 + + ; temp2 = (step1[0] - step1[1]) * cospi_16_64 + vsub.s32 q13, q2, q0 + vsub.s32 q1, q11, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d16, q3, #14 ; >> 14 + vqrshrn.s32 d17, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d18, q13, #14 ; >> 14 + vqrshrn.s32 d19, q1, #14 ; >> 14 + + ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + ; step1[2] * cospi_8_64 + vmull.s16 q0, d20, d31 + vmull.s16 q1, d21, d31 + + ; step1[2] * cospi_24_64 + vmull.s16 q12, d20, d30 + vmull.s16 q13, d21, d30 + + ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q0, d28, d30 + vmlal.s16 q1, d29, d30 + + ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q12, d28, d31 + vmlsl.s16 q13, d29, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d22, q0, #14 ; >> 14 + vqrshrn.s32 d23, q1, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d20, q12, #14 ; >> 14 + vqrshrn.s32 d21, q13, #14 ; >> 14 + + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; + vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; + + ; generate cospi_16_64 
= 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; stage 5 + vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; + vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; + vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; + vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; + + vdup.16 d16, r3; ; duplicate cospi_16_64 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q6, q9, q11 + vsub.s32 q13, q10, q12 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q6, #14 ; >> 14 + vqrshrn.s32 d11, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d16}, [r1], r2 + vst1.64 {d17}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |vp9_short_idct16x16_add_neon_pass1| + +;void vp9_short_idct16x16_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_short_idct16x16_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! 
+ vmov.s16 q15, q0; + + ; generate cospi_30_64 = 1606 + mov r3, #0x0600 + add r3, #0x46 + + ; generate cospi_2_64 = 16305 + mov r12, #0x3f00 + add r12, #0xb1 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d12, r3 ; duplicate cospi_30_64 + vdup.16 d13, r12 ; duplicate cospi_2_64 + + ; preloading to avoid stall + ; generate cospi_14_64 = 12665 + mov r3, #0x3100 + add r3, #0x79 + + ; generate cospi_18_64 = 10394 + mov r12, #0x2800 + add r12, #0x9a + + ; step1[8] * cospi_30_64 + vmull.s16 q2, d16, d12 + vmull.s16 q3, d17, d12 + + ; step1[8] * cospi_2_64 + vmull.s16 q1, d16, d13 + vmull.s16 q4, d17, d13 + + ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 + vmlsl.s16 q2, d30, d13 + vmlsl.s16 q3, d31, d13 + + ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 + vmlal.s16 q1, d30, d12 + vmlal.s16 q4, d31, d12 + + vdup.16 d30, r3 ; duplicate cospi_14_64 + vdup.16 d31, r12 ; duplicate cospi_18_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d0, q2, #14 ; >> 14 + vqrshrn.s32 d1, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q1, #14 ; >> 14 + vqrshrn.s32 d15, q4, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_22_64 = 7723 + mov r3, #0x1e00 + add r3, #0x2b + + ; generate cospi_10_64 = 14449 + mov r12, #0x3800 + add r12, #0x71 + + ; step1[9] * cospi_14_64 + vmull.s16 q2, d24, d30 + vmull.s16 q3, d25, d30 + + ; step1[9] * cospi_18_64 + vmull.s16 q4, d24, d31 + vmull.s16 q5, d25, d31 + + ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 + vmlsl.s16 q2, d22, d31 + vmlsl.s16 q3, d23, d31 + + ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 + vmlal.s16 q4, d22, d30 + vmlal.s16 q5, d23, d30 + + vdup.16 d30, r3 ; duplicate cospi_22_64 + vdup.16 d31, r12 ; duplicate cospi_10_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q2, #14 ; >> 14 + vqrshrn.s32 d3, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q4, #14 ; >> 14 + vqrshrn.s32 d13, q5, #14 ; >> 14 + + ; step1[10] * cospi_22_64 + vmull.s16 q11, d20, d30 + vmull.s16 q12, d21, d30 + + ; step1[10] * cospi_10_64 + vmull.s16 q4, d20, d31 + vmull.s16 q5, d21, d31 + + ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 + vmlsl.s16 q11, d26, d31 + vmlsl.s16 q12, d27, d31 + + ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 + vmlal.s16 q4, d26, d30 + vmlal.s16 q5, d27, d30 + + ; preloading to avoid stall + ; generate cospi_6_64 = 15679 + mov r3, #0x3d00 + add r3, #0x3f + + ; generate cospi_26_64 = 4756 + mov r12, #0x1200 + add r12, #0x94 + + vdup.16 d30, r3 ; duplicate cospi_6_64 + vdup.16 d31, r12 ; duplicate cospi_26_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d11, q5, #14 ; >> 14 + vqrshrn.s32 d10, q4, #14 ; >> 14 + + ; step1[11] * cospi_6_64 + vmull.s16 q10, d28, d30 + vmull.s16 q11, d29, d30 + + ; step1[11] * cospi_26_64 + vmull.s16 q12, d28, d31 + vmull.s16 q13, d29, d31 + + ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 + vmlsl.s16 q10, d18, d31 + vmlsl.s16 q11, d19, d31 + + ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 + vmlal.s16 q12, d18, d30 + vmlal.s16 q13, d19, d30 + + vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] + vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q11, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q12, #14 ; >> 14 + vqrshrn.s32 d9, q13, #14 ; 
>> 14 + + ; stage 3 + vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] + vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] + vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] + vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] + vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] + vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + + ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vdup.16 d30, r12 ; duplicate cospi_8_64 + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d18, d31 + vmull.s16 q3, d19, d31 + + ; step1[14] * cospi_24_64 + vmull.s16 q4, d28, d31 + vmull.s16 q5, d29, d31 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d28, d30 + vmlal.s16 q3, d29, d30 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q4, d18, d30 + vmlsl.s16 q5, d19, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q4, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + vmov.s16 q3, q11 + vmov.s16 q4, q12 + + ; - step1[13] * cospi_8_64 + vmull.s16 q11, d26, d30 + vmull.s16 q12, d27, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d20, d30 + vmull.s16 q9, d21, d30 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlsl.s16 q11, d20, d31 + vmlsl.s16 q12, d21, d31 + + ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d26, d31 + vmlal.s16 q9, d27, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. 
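All of the stage arithmetic in these passes follows one pattern: widen with vmull/vmlal/vmlsl, then narrow back with a rounding shift by DCT_CONST_BITS. A scalar sketch of that pattern, with constants and rounding taken from the asm itself (the vqrshrn narrowing also saturates, which the sketch omits):

#include <stdint.h>

/* DCT_CONST_BITS == 14; matches the "+ 0x2000, asr #14" sequence and the
 * vqrshrn.s32 #14 narrowings used throughout. */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)((input + (1 << 13)) >> 14);
}

/* One rotation butterfly of the kind each vmull/vmlal/vqrshrn group
 * implements, e.g. stage 4 of the odd half with a = step1[14],
 * b = step1[9] and (c1, c2) = (cospi_24_64, cospi_8_64) = (6270, 15137): */
static void butterfly(int16_t a, int16_t b, int c1, int c2,
                      int16_t *out0, int16_t *out1) {
  *out0 = dct_const_round_shift(a * c1 - b * c2);
  *out1 = dct_const_round_shift(a * c2 + b * c1);
}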
+ ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q10, q3, q0 + vadd.s32 q4, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q10, #14 ; >> 14 + vqrshrn.s32 d11, q4, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + cmp r3, #0 ; check if need adding dest data + beq skip_adding_dest + + ldr r7, [sp, #28] ; dest used to save element 0-7 + mov r9, r7 ; save dest pointer for later use + ldr r8, [sp, #32] ; load dest_stride + + ; stage 7 + ; load the data in pass1 + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; +
dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vld1.64 {d13}, [r7], r8 ; load destination data + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + + ; store the data output 8,9,10,11,12,13,14,15 + vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q8 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q9, q9, #6 + vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q9 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q2, q2, #6 + vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q2 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q3, q3, #6 + vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q3 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q4, q4, #6 + vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q4 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q5, q5, #6 + vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q5 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destination data + vrshr.s16 q14, q14, #6 + vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q14 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destination data + vrshr.s16 q15, q15, #6 + vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q15 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + b end_idct16x16_pass2 + +skip_adding_dest + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3
; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |vp9_short_idct16x16_add_neon_pass2| + +;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_short_idct10_16x16_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64*2 = 6392 + mov r3, #0x1800 + add r3, #0xf8 + + ; generate cospi_4_64*2 = 32138 + mov r12, #0x7d00 + add r12, #0x8a + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q0, r3 ; duplicate cospi_28_64*2 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(step2[4] * cospi_28_64). vqrdmulh will multiply, + ; double, and return the high 16 bits, effectively giving >> 15. Doubling + ; the constant will change this to >> 14.
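Spelled out in scalar form (the saturation only triggers for the 0x8000 * 0x8000 case):

#include <stdint.h>

/* Scalar model of vqrdmulh.s16: saturating (2*a*b + (1 << 15)) >> 16.
 * Feeding it a pre-doubled constant 2*c therefore yields
 * (a*c + (1 << 13)) >> 14, i.e. dct_const_round_shift(a * c). */
static int16_t vqrdmulh_s16(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * b + (1 << 15);
  p >>= 16;
  return (int16_t)(p > INT16_MAX ? INT16_MAX : p);
}

/* e.g. vqrdmulh_s16(x, 6392) == dct_const_round_shift(x * 3196), with
 * 3196 == cospi_28_64 and 6392 == cospi_28_64*2 as generated above. */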
+ ; dct_const_round_shift(step2[4] * cospi_28_64); + vqrdmulh.s16 q4, q9, q0 + + ; preloading to avoid stall + ; generate cospi_16_64*2 = 23170 + mov r3, #0x5a00 + add r3, #0x82 + + ; dct_const_round_shift(step2[4] * cospi_4_64); + vqrdmulh.s16 q7, q9, q1 + + ; stage 4 + vdup.16 q1, r3 ; cospi_16_64*2 + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + vdup.16 d4, r3; ; duplicate cospi_16_64 + + ; dct_const_round_shift(step1[0] * cospi_16_64) + vqrdmulh.s16 q8, q8, q1 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d14, d4 + vmull.s16 q10, d15, d4 + + ; step2[5] * cospi_16_64 + vmull.s16 q12, d9, d4 + vmull.s16 q11, d8, d4 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q15, q10, q12 + vsub.s32 q6, q9, q11 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d11, q15, #14 ; >> 14 + vqrshrn.s32 d10, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; + vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; + vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d4}, [r1], r2 + vst1.64 {d5}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + +;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|vp9_short_idct10_16x16_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! 
+ vmov.s16 q15, q0; + + ; generate 2*cospi_30_64 = 3212 + mov r3, #0xc00 + add r3, #0x8c + + ; generate 2*cospi_2_64 = 32610 + mov r12, #0x7f00 + add r12, #0x62 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q6, r3 ; duplicate 2*cospi_30_64 + + ; dct_const_round_shift(step1[8] * cospi_30_64) + vqrdmulh.s16 q0, q8, q6 + + vdup.16 q6, r12 ; duplicate 2*cospi_2_64 + + ; dct_const_round_shift(step1[8] * cospi_2_64) + vqrdmulh.s16 q7, q8, q6 + + ; preloading to avoid stall + ; generate 2*cospi_26_64 = 9512 + mov r12, #0x2500 + add r12, #0x28 + rsb r12, #0 + vdup.16 q15, r12 ; duplicate -2*cospi_26_64 + + ; generate 2*cospi_6_64 = 31358 + mov r3, #0x7a00 + add r3, #0x7e + vdup.16 q14, r3 ; duplicate 2*cospi_6_64 + + ; dct_const_round_shift(- step1[12] * cospi_26_64) + vqrdmulh.s16 q3, q9, q15 + + ; dct_const_round_shift(step1[12] * cospi_6_64) + vqrdmulh.s16 q4, q9, q14 + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + vdup.16 d30, r12 ; duplicate cospi_8_64 + + ; step1[14] * cospi_24_64 + vmull.s16 q12, d14, d31 + vmull.s16 q5, d15, d31 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d0, d31 + vmull.s16 q11, d1, d31 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q12, d0, d30 + vmlsl.s16 q5, d1, d30 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d14, d30 + vmlal.s16 q11, d15, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q12, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q11, #14 ; >> 14 + + ; - step1[13] * cospi_8_64 + vmull.s16 q10, d8, d30 + vmull.s16 q13, d9, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d6, d30 + vmull.s16 q9, d7, d30 + + ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 + vmlsl.s16 q10, d6, d31 + vmlsl.s16 q13, d7, d31 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d8, d31 + vmlal.s16 q9, d9, d31 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q10, #14 ; >> 14 + vqrshrn.s32 d5, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. 
+ ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q0, q3, q0 + vadd.s32 q1, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q0, #14 ; >> 14 + vqrshrn.s32 d11, q1, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 
{d31}, [r1], r5 +end_idct10_16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| +;void |save_neon_registers|() +|save_neon_registers| PROC + vpush {d8-d15} + bx lr + ENDP ; |save_registers| +;void |restore_neon_registers|() +|restore_neon_registers| PROC + vpop {d8-d15} + bx lr + ENDP ; |restore_registers| + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm new file mode 100644 index 0000000..5c097cc --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -0,0 +1,1013 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +;TODO(cd): adjust these constants to be able to use vqdmulh for faster +; dct_const_round_shift(a * b) within butterfly calculations. +cospi_1_64 EQU 16364 +cospi_2_64 EQU 16305 +cospi_3_64 EQU 16207 +cospi_4_64 EQU 16069 +cospi_5_64 EQU 15893 +cospi_6_64 EQU 15679 +cospi_7_64 EQU 15426 +cospi_8_64 EQU 15137 +cospi_9_64 EQU 14811 +cospi_10_64 EQU 14449 +cospi_11_64 EQU 14053 +cospi_12_64 EQU 13623 +cospi_13_64 EQU 13160 +cospi_14_64 EQU 12665 +cospi_15_64 EQU 12140 +cospi_16_64 EQU 11585 +cospi_17_64 EQU 11003 +cospi_18_64 EQU 10394 +cospi_19_64 EQU 9760 +cospi_20_64 EQU 9102 +cospi_21_64 EQU 8423 +cospi_22_64 EQU 7723 +cospi_23_64 EQU 7005 +cospi_24_64 EQU 6270 +cospi_25_64 EQU 5520 +cospi_26_64 EQU 4756 +cospi_27_64 EQU 3981 +cospi_28_64 EQU 3196 +cospi_29_64 EQU 2404 +cospi_30_64 EQU 1606 +cospi_31_64 EQU 804 + + + EXPORT |idct32_transpose_and_transform| + EXPORT |idct32_combine_add| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY + + ; -------------------------------------------------------------------------- + ; Load from transposed_buffer + ; q13 = transposed_buffer[first_offset] + ; q14 = transposed_buffer[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; transposed_buffer must be passed in. use 0 for first use. + MACRO + LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset + ; address calculation with proper stride and loading + add r0, #($first_offset - $prev_offset )*8*2 + vld1.s16 {q14}, [r0] + add r0, #($second_offset - $first_offset)*8*2 + vld1.s16 {q13}, [r0] + ; (used) two registers (q14, q13) + MEND + ; -------------------------------------------------------------------------- + ; Load from output (used as temporary storage) + ; reg1 = output[first_offset] + ; reg2 = output[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use.
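The cospi_k_64 EQU table at the top of this file is round(16384 * cos(k*pi/64)), i.e. the DCT cosines in 14-bit fixed point; a small generator that reproduces it:

#include <math.h>
#include <stdio.h>

#define PI 3.14159265358979323846

int main(void) {
  int k;
  for (k = 1; k < 32; ++k)
    printf("cospi_%d_64 EQU %d\n", k,
           (int)lround(16384.0 * cos(k * PI / 64)));
  return 0;
}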
+ MACRO + LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and loading + add r1, #($first_offset - $prev_offset )*32*2 + vld1.s16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vld1.s16 {$reg2}, [r1] + ; (used) two registers ($reg1, $reg2) + MEND + ; -------------------------------------------------------------------------- + ; Store into output (sometimes used as temporary storage) + ; output[first_offset] = reg1 + ; output[second_offset] = reg2 + ; for proper address calculation, the last offset used when manipulating + ; output (whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and storing + add r1, #($first_offset - $prev_offset )*32*2 + vst1.16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vst1.16 {$reg2}, [r1] + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + ; TODO(cd): have special case to re-use constants when they are similar for + ; consecutive butterflies + ; TODO(cd): have special case when both constants are the same, do the + ; additions/subtractions before the multiplies. + ; generate the constants + ; generate scalar constants + mov r3, #$first_constant & 0xFF00 + add r3, #$first_constant & 0x00FF + mov r12, #$second_constant & 0xFF00 + add r12, #$second_constant & 0x00FF + ; generate vector constants + vdup.16 d30, r3 + vdup.16 d31, r12 + ; (used) two for inputs (regA-regD), one for constants (q15) + ; do some multiplications (ordered for maximum latency hiding) + vmull.s16 q8, $regC, d30 + vmull.s16 q10, $regA, d31 + vmull.s16 q9, $regD, d30 + vmull.s16 q11, $regB, d31 + vmull.s16 q12, $regC, d31 + ; (used) five for intermediate (q8-q12), one for constants (q15) + ; do some additions/subtractions (to get back two registers) + vsub.s32 q8, q8, q10 + vsub.s32 q9, q9, q11 + ; do more multiplications (ordered for maximum latency hiding) + vmull.s16 q10, $regD, d31 + vmull.s16 q11, $regA, d30 + vmull.s16 q15, $regB, d30 + ; (used) six for intermediate (q8-q12, q15) + ; do more additions/subtractions + vadd.s32 q11, q12, q11 + vadd.s32 q10, q10, q15 + ; (used) four for intermediate (q8-q11) + ; dct_const_round_shift + vqrshrn.s32 $reg1, q8, #14 + vqrshrn.s32 $reg2, q9, #14 + vqrshrn.s32 $reg3, q11, #14 + vqrshrn.s32 $reg4, q10, #14 + ; (used) two q registers for results, i.e. four d registers + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + MEND + ; -------------------------------------------------------------------------- + +;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input); +; +; r0 int16_t *transpose_buffer +; r1 int16_t *output +; r2 int16_t *input) +; TODO(cd): have more logical parameter ordering but this issue will disappear +; when functions are combined.
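DO_BUTTERFLY is the workhorse of everything that follows. Per 16-bit lane it computes the standard rotate-and-round pair; a scalar sketch under the DO_BUTTERFLY_STD register convention (input a from q14, input b from q13; helper names are ours, and the asm's vqrshrn additionally saturates the 32-to-16-bit narrowing, which this sketch omits):

    #include <stdint.h>

    /* dct_const_round_shift with DCT_CONST_BITS == 14 */
    static int16_t dct_const_round_shift(int32_t x) {
      return (int16_t)((x + (1 << 13)) >> 14);
    }

    /* One butterfly: out1 = round(a*k1 - b*k2), out2 = round(a*k2 + b*k1). */
    static void do_butterfly(int16_t a, int16_t b, int16_t k1, int16_t k2,
                             int16_t *out1, int16_t *out2) {
      *out1 = dct_const_round_shift(a * k1 - b * k2);
      *out2 = dct_const_round_shift(a * k2 + b * k1);
    }

The five vmull/two vsub then three vmull/two vadd split in the macro is just this pair of dot products, scheduled to hide multiplier latency.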
+ +|idct32_transpose_and_transform| PROC + ; This function does one pass of the idct32x32 transform. + ; + ; This is done by transposing the input and then doing a 1d transform on + ; columns. In the first pass, the transposed columns are the original + ; rows. In the second pass, after the transposition, the columns are the + ; original columns. + ; The 1d transform is done by looping over bands of eight columns (the + ; idct32_bands loop). For each band, the transform input transposition + ; is done on demand, one band of four 8x8 matrices at a time. The four + ; matrices are transposed in pairs (the idct32_transpose_pair loop). + push {r4} + mov r4, #0 ; initialize bands loop counter +idct32_bands_loop + ; TODO(cd) get rid of these push/pop by properly adjusting register + ; content at end of loop + push {r0} + push {r1} + push {r2} + mov r3, #0 ; initialize transpose loop counter +idct32_transpose_pair_loop + ; Load two horizontally consecutive 8x8 16bit data matrices. The first one + ; into q0-q7 and the second one into q8-q15. There is a stride of 64, + ; adjusted to 32 because of the two post-increments. + vld1.s16 {q8}, [r2]! + vld1.s16 {q0}, [r2]! + add r2, #32 + vld1.s16 {q9}, [r2]! + vld1.s16 {q1}, [r2]! + add r2, #32 + vld1.s16 {q10}, [r2]! + vld1.s16 {q2}, [r2]! + add r2, #32 + vld1.s16 {q11}, [r2]! + vld1.s16 {q3}, [r2]! + add r2, #32 + vld1.s16 {q12}, [r2]! + vld1.s16 {q4}, [r2]! + add r2, #32 + vld1.s16 {q13}, [r2]! + vld1.s16 {q5}, [r2]! + add r2, #32 + vld1.s16 {q14}, [r2]! + vld1.s16 {q6}, [r2]! + add r2, #32 + vld1.s16 {q15}, [r2]! + vld1.s16 {q7}, [r2]! + + ; Transpose the two 8x8 16bit data matrices. + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vswp d1, d8 + vswp d7, d14 + vswp d5, d12 + vswp d3, d10 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; Store both matrices after each other. There is a stride of 32, which + ; adjusts to nothing because of the post-increments. + vst1.16 {q8}, [r0]! + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + vst1.16 {q0}, [r0]! + vst1.16 {q1}, [r0]! + vst1.16 {q2}, [r0]! + vst1.16 {q3}, [r0]! + vst1.16 {q4}, [r0]! + vst1.16 {q5}, [r0]! + vst1.16 {q6}, [r0]! + vst1.16 {q7}, [r0]! + + ; increment pointers by adjusted stride (not necessary for r0/transpose_buffer) + sub r2, r2, #8*32*2-32-16*2 + ; transpose pair loop processing + add r3, r3, #1 + cmp r3, #1 + BLE idct32_transpose_pair_loop + + ; restore r0/transpose_buffer to its original value + sub r0, r0, #32*8*2 + + ; Instead of doing the transforms stage by stage, it is done by loading + ; some input values and doing as many stages as possible to minimize the + ; storing/loading of intermediate results. To fit within registers, the + ; final coefficients are cut into four blocks: + ; BLOCK A: 16-19,28-31 + ; BLOCK B: 20-23,24-27 + ; BLOCK C: 8-10,11-15 + ; BLOCK D: 0-3,4-7 + ; Blocks A and C are straight calculation through the various stages. In + ; block B, further calculations are performed using the results from + ; block A. In block D, further calculations are performed using the results + ; from block C and then the final calculations are done using results from + ; blocks A and B, which were combined at the end of block B.
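Before diving into the blocks, it helps to see how the two exports are meant to compose into a full 32x32 inverse transform. A hedged sketch of the presumed C-side driver (the wrapper itself and the buffer names are assumptions, not part of this file; only the two prototypes come from the comments here):

    #include <stdint.h>

    extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
                                               int16_t *output, int16_t *input);
    extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);

    /* Two 1-D passes (each transposes on demand, one band of eight columns
       at a time), then the rounding add into the destination. */
    static void idct32x32_add_sketch(int16_t *input, uint8_t *dest, int stride) {
      int16_t transpose_buffer[32 * 8];   /* scratch for one band */
      int16_t pass1[32 * 32], pass2[32 * 32];
      idct32_transpose_and_transform(transpose_buffer, pass1, input);
      idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
      idct32_combine_add(dest, pass2, stride);
    }

The save_neon_registers/restore_neon_registers helpers defined earlier presumably bracket such calls on the C side, since the asm clobbers the callee-saved d8-d15.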
+ + ; -------------------------------------------------------------------------- + ; BLOCK A: 16-19,28-31 + ; -------------------------------------------------------------------------- + ; generate 16,17,30,31 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; + ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; + ;step1b[16][i] = dct_const_round_shift(temp1); + ;step1b[31][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 0, 1, 31 + DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; + ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; + ;step1b[17][i] = dct_const_round_shift(temp1); + ;step1b[30][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 31, 17, 15 + DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[16] = step1b[16][i] + step1b[17][i]; + ;step2[17] = step1b[16][i] - step1b[17][i]; + ;step2[30] = -step1b[30][i] + step1b[31][i]; + ;step2[31] = step1b[30][i] + step1b[31][i]; + vadd.s16 q4, q0, q1 + vsub.s16 q13, q0, q1 + vadd.s16 q6, q2, q3 + vsub.s16 q14, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; + ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; + ;step3[17] = dct_const_round_shift(temp1); + ;step3[30] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; generate 18,19,28,29 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; + ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; + ;step1b[18][i] = dct_const_round_shift(temp1); + ;step1b[29][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 15, 9, 23 + DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; + ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; + ;step1b[19][i] = dct_const_round_shift(temp1); + ;step1b[28][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 23, 25, 7 + DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[18] = -step1b[18][i] + step1b[19][i]; + ;step2[19] = step1b[18][i] + step1b[19][i]; + ;step2[28] = step1b[28][i] + step1b[29][i]; + ;step2[29] = step1b[28][i] - step1b[29][i]; + vsub.s16 q13, q3, q2 + vadd.s16 q3, q3, q2 + vsub.s16 q14, q1, q0 + vadd.s16 q2, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); + ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); + ;step3[29] = dct_const_round_shift(temp1); + ;step3[18] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_4_64), 
(-cospi_28_64), d2, d3, d0, d1 + ; -------------------------------------------------------------------------- + ; combine 16-19,28-31 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[16] = step1b[16][i] + step1b[19][i]; + ;step1[17] = step1b[17][i] + step1b[18][i]; + ;step1[18] = step1b[17][i] - step1b[18][i]; + ;step1[29] = step1b[30][i] - step1b[29][i]; + ;step1[30] = step1b[30][i] + step1b[29][i]; + ;step1[31] = step1b[31][i] + step1b[28][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q0 + vadd.s16 q10, q7, q1 + vadd.s16 q15, q6, q3 + vsub.s16 q13, q5, q0 + vsub.s16 q14, q7, q1 + STORE_IN_OUTPUT 0, 16, 31, q8, q15 + STORE_IN_OUTPUT 31, 17, 30, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; + ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; + ;step2[18] = dct_const_round_shift(temp1); + ;step2[29] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 + STORE_IN_OUTPUT 30, 29, 18, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[19] = step1b[16][i] - step1b[19][i]; + ;step1[28] = step1b[31][i] - step1b[28][i]; + vsub.s16 q13, q4, q2 + vsub.s16 q14, q6, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; + ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; + ;step2[19] = dct_const_round_shift(temp1); + ;step2[28] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 + STORE_IN_OUTPUT 18, 19, 28, q4, q6 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK B: 20-23,24-27 + ; -------------------------------------------------------------------------- + ; generate 20,21,26,27 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; + ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; + ;step1b[20][i] = dct_const_round_shift(temp1); + ;step1b[27][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 7, 5, 27 + DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; + ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; + ;step1b[21][i] = dct_const_round_shift(temp1); + ;step1b[26][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 27, 21, 11 + DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[20] = step1b[20][i] + step1b[21][i]; + ;step2[21] = step1b[20][i] - step1b[21][i]; + ;step2[26] = -step1b[26][i] + step1b[27][i]; + ;step2[27] = step1b[26][i] + step1b[27][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; + ;temp2 = step1b[26][i] * cospi_20_64 + 
step1b[21][i] * cospi_12_64; + ;step3[21] = dct_const_round_shift(temp1); + ;step3[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 22,23,24,25 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; + ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; + ;step1b[22][i] = dct_const_round_shift(temp1); + ;step1b[25][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 11, 13, 19 + DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; + ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; + ;step1b[23][i] = dct_const_round_shift(temp1); + ;step1b[24][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 19, 29, 3 + DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[22] = -step1b[22][i] + step1b[23][i]; + ;step2[23] = step1b[22][i] + step1b[23][i]; + ;step2[24] = step1b[24][i] + step1b[25][i]; + ;step2[25] = step1b[24][i] - step1b[25][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); + ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); + ;step3[25] = dct_const_round_shift(temp1); + ;step3[22] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 20-23,24-27 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[22] = step1b[22][i] + step1b[21][i]; + ;step1[23] = step1b[23][i] + step1b[20][i]; + vadd.s16 q10, q7, q1 + vadd.s16 q11, q5, q0 + ;step1[24] = step1b[24][i] + step1b[27][i]; + ;step1[25] = step1b[25][i] + step1b[26][i]; + vadd.s16 q12, q6, q2 + vadd.s16 q15, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[16] = step1b[16][i] + step1b[23][i]; + ;step3[17] = step1b[17][i] + step1b[22][i]; + ;step3[22] = step1b[17][i] - step1b[22][i]; + ;step3[23] = step1b[16][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 + vadd.s16 q8, q14, q11 + vadd.s16 q9, q13, q10 + vsub.s16 q13, q13, q10 + vsub.s16 q11, q14, q11 + STORE_IN_OUTPUT 17, 17, 16, q9, q8 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[24] = step1b[31][i] - step1b[24][i]; + ;step3[25] = step1b[30][i] - step1b[25][i]; + ;step3[30] = step1b[30][i] + step1b[25][i]; + ;step3[31] = step1b[31][i] + step1b[24][i]; + LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 + vsub.s16 q8, q9, q12 + vadd.s16 q10, q14, q15 + vsub.s16 q14, q14, q15 + vadd.s16 q12, q9, q12 + STORE_IN_OUTPUT 31, 30, 31, q10, q12 + ; -------------------------------------------------------------------------- + ; TODO(cd) do some register allocation change to remove these push/pop + vpush {q8} ; [24] + vpush {q11} ; [23] + ; 
-------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; + ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; + ;step1[22] = dct_const_round_shift(temp1); + ;step1[25] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 31, 25, 22, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; + ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; + ;step1[23] = dct_const_round_shift(temp1); + ;step1[24] = dct_const_round_shift(temp2); + ; TODO(cd) do some register allocation change to remove these push/pop + vpop {q13} ; [23] + vpop {q14} ; [24] + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 22, 24, 23, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[20] = step1b[23][i] - step1b[20][i]; + ;step1[27] = step1b[24][i] - step1b[27][i]; + vsub.s16 q14, q5, q0 + vsub.s16 q13, q6, q2 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); + ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); + ;step2[27] = dct_const_round_shift(temp1); + ;step2[20] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[21] = step1b[22][i] - step1b[21][i]; + ;step1[26] = step1b[25][i] - step1b[26][i]; + vsub.s16 q14, q7, q1 + vsub.s16 q13, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); + ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); + ;step2[26] = dct_const_round_shift(temp1); + ;step2[21] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[18] = step1b[18][i] + step1b[21][i]; + ;step3[19] = step1b[19][i] + step1b[20][i]; + ;step3[20] = step1b[19][i] - step1b[20][i]; + ;step3[21] = step1b[18][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 + vadd.s16 q8, q14, q1 + vadd.s16 q9, q13, q6 + vsub.s16 q13, q13, q6 + vsub.s16 q1, q14, q1 + STORE_IN_OUTPUT 19, 18, 19, q8, q9 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[27] = step1b[28][i] - step1b[27][i]; + ;step3[28] = step1b[28][i] + step1b[27][i]; + ;step3[29] = step1b[29][i] + step1b[26][i]; + ;step3[26] = step1b[29][i] - step1b[26][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 + vsub.s16 q14, q8, q5 + vadd.s16 q10, q8, q5 + vadd.s16 q11, q9, q0 + vsub.s16 q0, q9, q0 + STORE_IN_OUTPUT 29, 28, 29, q10, q11 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; + ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; + ;step1[20] = dct_const_round_shift(temp1); + ;step1[27] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 29, 20, 27, q13, q14 + ; 
-------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; + ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; + ;step1[21] = dct_const_round_shift(temp1); + ;step1[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 + STORE_IN_OUTPUT 27, 21, 26, q1, q0 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK C: 8-10,11-15 + ; -------------------------------------------------------------------------- + ; generate 8,9,14,15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; + ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; + ;step2[8] = dct_const_round_shift(temp1); + ;step2[15] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 3, 2, 30 + DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; + ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; + ;step2[9] = dct_const_round_shift(temp1); + ;step2[14] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 30, 18, 14 + DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[8] = step1b[8][i] + step1b[9][i]; + ;step3[9] = step1b[8][i] - step1b[9][i]; + ;step3[14] = step1b[15][i] - step1b[14][i]; + ;step3[15] = step1b[15][i] + step1b[14][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; + ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; + ;step1[9] = dct_const_round_shift(temp1); + ;step1[14] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 10,11,12,13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; + ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; + ;step2[10] = dct_const_round_shift(temp1); + ;step2[13] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 14, 10, 22 + DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; + ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; + ;step2[11] = dct_const_round_shift(temp1); + ;step2[12] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 22, 26, 6 + DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[10] = step1b[11][i] - step1b[10][i]; + ;step3[11] = step1b[11][i] + step1b[10][i]; + ;step3[12] = step1b[12][i] + step1b[13][i]; + ;step3[13] = step1b[12][i] - step1b[13][i]; + vsub.s16 q14, q4, 
q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); + ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); + ;step1[13] = dct_const_round_shift(temp1); + ;step1[10] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 8-10,11-15 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[8] = step1b[8][i] + step1b[11][i]; + ;step2[9] = step1b[9][i] + step1b[10][i]; + ;step2[10] = step1b[9][i] - step1b[10][i]; + vadd.s16 q8, q0, q5 + vadd.s16 q9, q1, q7 + vsub.s16 q13, q1, q7 + ;step2[13] = step1b[14][i] - step1b[13][i]; + ;step2[14] = step1b[14][i] + step1b[13][i]; + ;step2[15] = step1b[15][i] + step1b[12][i]; + vsub.s16 q14, q3, q4 + vadd.s16 q10, q3, q4 + vadd.s16 q15, q2, q6 + STORE_IN_OUTPUT 26, 8, 15, q8, q15 + STORE_IN_OUTPUT 15, 9, 14, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; + ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; + ;step3[10] = dct_const_round_shift(temp1); + ;step3[13] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 14, 13, 10, q3, q1 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[11] = step1b[8][i] - step1b[11][i]; + ;step2[12] = step1b[15][i] - step1b[12][i]; + vsub.s16 q13, q0, q5 + vsub.s16 q14, q2, q6 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; + ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; + ;step3[11] = dct_const_round_shift(temp1); + ;step3[12] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 10, 11, 12, q1, q3 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK D: 0-3,4-7 + ; -------------------------------------------------------------------------- + ; generate 4,5,6,7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; + ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; + ;step3[4] = dct_const_round_shift(temp1); + ;step3[7] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 6, 4, 28 + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; + ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; + ;step3[5] = dct_const_round_shift(temp1); + ;step3[6] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 28, 20, 12 + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[4] = step1b[4][i] + step1b[5][i]; + ;step1[5] = step1b[4][i] - step1b[5][i]; + ;step1[6] = step1b[7][i] - step1b[6][i]; + 
;step1[7] = step1b[7][i] + step1b[6][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; + ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; + ;step2[5] = dct_const_round_shift(temp1); + ;step2[6] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 0,1,2,3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; + ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; + ;step1[1] = dct_const_round_shift(temp1); + ;step1[0] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 12, 0, 16 + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; + ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; + ;step1[2] = dct_const_round_shift(temp1); + ;step1[3] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 16, 8, 24 + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[0] = step1b[0][i] + step1b[3][i]; + ;step2[1] = step1b[1][i] + step1b[2][i]; + ;step2[2] = step1b[1][i] - step1b[2][i]; + ;step2[3] = step1b[0][i] - step1b[3][i]; + vadd.s16 q4, q7, q6 + vsub.s16 q7, q7, q6 + vsub.s16 q6, q5, q14 + vadd.s16 q5, q5, q14 + ; -------------------------------------------------------------------------- + ; combine 0-3,4-7 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[0] = step1b[0][i] + step1b[7][i]; + ;step3[1] = step1b[1][i] + step1b[6][i]; + ;step3[2] = step1b[2][i] + step1b[5][i]; + ;step3[3] = step1b[3][i] + step1b[4][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q3 + vadd.s16 q10, q6, q1 + vadd.s16 q11, q7, q0 + ;step3[4] = step1b[3][i] - step1b[4][i]; + ;step3[5] = step1b[2][i] - step1b[5][i]; + ;step3[6] = step1b[1][i] - step1b[6][i]; + ;step3[7] = step1b[0][i] - step1b[7][i]; + vsub.s16 q12, q7, q0 + vsub.s16 q13, q6, q1 + vsub.s16 q14, q5, q3 + vsub.s16 q15, q4, q2 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[0] = step1b[0][i] + step1b[15][i]; + ;step1[1] = step1b[1][i] + step1b[14][i]; + ;step1[14] = step1b[1][i] - step1b[14][i]; + ;step1[15] = step1b[0][i] - step1b[15][i]; + LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 + vadd.s16 q2, q8, q1 + vadd.s16 q3, q9, q0 + vsub.s16 q4, q9, q0 + vsub.s16 q5, q8, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[14 * 32] = step1b[14][i] + step1b[17][i]; + ;output[15 * 32] = step1b[15][i] + step1b[16][i]; + ;output[16 * 32] = step1b[15][i] - step1b[16][i]; + ;output[17 * 32] = step1b[14][i] - step1b[17][i]; + LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 17, 17, 16, q7, q6 + STORE_IN_OUTPUT 16, 15, 14, q9, q8 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + 
step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 14, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 31, 31, 30, q7, q6 + STORE_IN_OUTPUT 30, 0, 1, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q6, q4, q1 + vadd.s16 q7, q5, q0 + vsub.s16 q8, q5, q0 + vsub.s16 q9, q4, q1 + STORE_IN_OUTPUT 19, 19, 18, q9, q8 + STORE_IN_OUTPUT 18, 13, 12, q7, q6 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 12, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 29, 29, 28, q7, q6 + STORE_IN_OUTPUT 28, 2, 3, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q6, q4, q1 + vadd.s16 q7, q5, q0 + vsub.s16 q8, q5, q0 + vsub.s16 q9, q4, q1 + STORE_IN_OUTPUT 21, 21, 20, q9, q8 + STORE_IN_OUTPUT 20, 11, 10, q7, q6 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 10, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 27, 27, 26, q7, q6 + STORE_IN_OUTPUT 26, 4, 5, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, 
q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q6, q4, q1 + vadd.s16 q7, q5, q0 + vsub.s16 q8, q5, q0 + vsub.s16 q9, q4, q1 + STORE_IN_OUTPUT 23, 23, 22, q9, q8 + STORE_IN_OUTPUT 22, 9, 8, q7, q6 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 8, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 25, 25, 24, q7, q6 + STORE_IN_OUTPUT 24, 6, 7, q4, q5 + ; -------------------------------------------------------------------------- + + ; TODO(cd) get rid of these push/pop by properly adjusting register + ; content at end of loop + pop {r2} + pop {r1} + pop {r0} + add r1, r1, #8*2 + add r2, r2, #8*32*2 + + ; bands loop processing + add r4, r4, #1 + cmp r4, #3 + BLE idct32_bands_loop + + pop {r4} + bx lr + ENDP ; |idct32_transpose_and_transform| + +;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); +; +; r0 uint8_t *dest +; r1 int16_t *out +; r2 int dest_stride) + +|idct32_combine_add| PROC + + mov r12, r0 ; dest pointer used for stores + sub r2, r2, #32 ; adjust the stride (remove the post-increments) + mov r3, #0 ; initialize loop counter + +idct32_combine_add_loop + ; load out[j * 32 + 0-31] + vld1.s16 {q12}, [r1]! + vld1.s16 {q13}, [r1]! + vld1.s16 {q14}, [r1]! + vld1.s16 {q15}, [r1]! + ; load dest[j * dest_stride + 0-31] + vld1.s16 {q6}, [r0]! + vld1.s16 {q7}, [r0]! + ; ROUND_POWER_OF_TWO + vrshr.s16 q12, q12, #6 + vrshr.s16 q13, q13, #6 + vrshr.s16 q14, q14, #6 + vrshr.s16 q15, q15, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q12, q12, d12 + vaddw.u8 q13, q13, d13 + vaddw.u8 q14, q14, d14 + vaddw.u8 q15, q15, d15 + ; clip pixel + vqmovun.s16 d12, q12 + vqmovun.s16 d13, q13 + vqmovun.s16 d14, q14 + vqmovun.s16 d15, q15 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {q6}, [r12]! + vst1.16 {q7}, [r12]! + ; increment pointers by adjusted stride (not necessary for r1/out) + add r0, r0, r2 + add r12, r12, r2 + ; loop processing + add r3, r3, #1 + cmp r3, #31 + BLE idct32_combine_add_loop + + bx lr + ENDP ; |idct32_combine_add| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm new file mode 100644 index 0000000..869ee5f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm @@ -0,0 +1,68 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree.
+; + + + EXPORT |vp9_short_idct4x4_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct4x4_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.s16 q0, r0 ; duplicate a1 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q8, q0, d2 ; dest[x] + a1 + vaddw.u8 q9, q0, d4 + + vqmovun.s16 d6, q8 ; clip_pixel + vqmovun.s16 d7, q9 + + vst1.32 {d6[0]}, [r12], r2 + vst1.32 {d6[1]}, [r12], r2 + vst1.32 {d7[0]}, [r12], r2 + vst1.32 {d7[1]}, [r12] + + bx lr + ENDP ; |vp9_short_idct4x4_1_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm new file mode 100644 index 0000000..640fb93 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm @@ -0,0 +1,190 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_idct4x4_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct4x4_add_neon| PROC + + ; The 2D transform is done with two passes which are actually pretty + ; similar. We first transform the rows. This is done by transposing + ; the inputs, doing an SIMD column transform (the columns are the + ; transposed rows) and then transpose the results (so that it goes back + ; in normal/row positions). Then, we transform the columns by doing + ; another SIMD column transform. + ; So, two passes of a transpose followed by a column transform. + + ; load the inputs into q8-q9, d16-d19 + vld1.s16 {q8,q9}, [r0]! 
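Both passes below feed the usual add-and-clip epilogue (vrshr by 4 bits, vaddw.u8 into the destination, then vqmovun.s16 to saturate back to bytes). In scalar terms, a sketch with our own macro and helper names:

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static uint8_t clip_pixel(int val) {
      return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
    }

    /* vrshr #4 + vaddw.u8 + vqmovun.s16, per pixel of the 4x4 block */
    static void add_and_clip_4x4(uint8_t *dest, int stride, const int16_t *out) {
      for (int j = 0; j < 4; ++j)
        for (int i = 0; i < 4; ++i)
          dest[j * stride + i] = clip_pixel(dest[j * stride + i] +
                                            ROUND_POWER_OF_TWO(out[j * 4 + i], 4));
    }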
+ + ; generate scalar constants + ; cospi_8_64 = 15137 = 0x3b21 + mov r0, #0x3b00 + add r0, #0x21 + ; cospi_16_64 = 11585 = 0x2d41 + mov r3, #0x2d00 + add r3, #0x41 + ; cospi_24_64 = 6270 = 0x 187e + mov r12, #0x1800 + add r12, #0x7e + + ; transpose the input data + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + + ; generate constant vectors + vdup.16 d20, r0 ; replicate cospi_8_64 + vdup.16 d21, r3 ; replicate cospi_16_64 + + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + vdup.16 d22, r12 ; replicate cospi_24_64 + + ; do the transform on transposed rows + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + + ; transpose the results + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + ; do the transform on columns + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + + ; The results are in two registers, one of them being swapped. This will + ; be taken care of by loading the 'dest' value in a swapped fashion and + ; also storing them in the same swapped fashion. 
+ ; temp_out[0, 1] = d16, d17 = q8 + ; temp_out[2, 3] = d19, d18 = q9 swapped + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1] ; no post-increment + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vp9_short_idct4x4_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm new file mode 100644 index 0000000..923804f --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp9_short_idct8x8_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct8x8_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 5) + add r0, r0, #16 ; + (1 <<((5) - 1)) + asr r0, r0, #5 ; >> 5 + + vdup.s16 q0, r0 ; duplicate a1 + + ; load destination data + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r2 + vld1.64 {d17}, [r1] + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |vp9_short_idct8x8_1_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index 
f829665..a744f59 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -9,6 +9,7 @@ ; EXPORT |vp9_short_idct8x8_add_neon| + EXPORT |vp9_short_idct10_8x8_add_neon| ARM REQUIRE8 PRESERVE8 @@ -24,191 +25,149 @@ ; stage 1 vdup.16 d0, r3 ; duplicate cospi_28_64 vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 ; input[1] * cospi_28_64 vmull.s16 q2, d18, d0 vmull.s16 q3, d19, d0 - ; input[7] * cospi_4_64 - vmull.s16 q4, d30, d1 - vmull.s16 q5, d31, d1 + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vsub.s32 q6, q2, q4 - vsub.s32 q7, q3, q5 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q6, #14 ; >> 14 - vqrshrn.s32 d9, q7, #14 ; >> 14 + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 ; input[1] * cospi_4_64 vmull.s16 q2, d18, d1 vmull.s16 q3, d19, d1 - ; input[7] * cospi_28_64 - vmull.s16 q1, d30, d0 - vmull.s16 q5, d31, d0 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vadd.s32 q2, q2, q1 - vadd.s32 q3, q3, q5 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - vdup.16 d0, r5 ; duplicate cospi_12_64 - vdup.16 d1, r6 ; duplicate cospi_20_64 - - ; input[5] * cospi_12_64 - vmull.s16 q2, d26, d0 - vmull.s16 q3, d27, d0 - - ; input[3] * cospi_20_64 - vmull.s16 q5, d22, d1 - vmull.s16 q6, d23, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vsub.s32 q2, q2, q5 - vsub.s32 q3, q3, q6 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - ; input[5] * cospi_20_64 - vmull.s16 q2, d26, d1 - vmull.s16 q3, d27, d1 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 - ; input[3] * cospi_12_64 - vmull.s16 q9, d22, d0 - vmull.s16 q15, d23, d0 + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vadd.s32 q0, q2, q9 - vadd.s32 q1, q3, q15 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q0, #14 ; >> 14 - vqrshrn.s32 d13, q1, #14 ; >> 14 + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 ; stage 2 & stage 3 - even half vdup.16 d0, r7 ; duplicate cospi_16_64 + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + ; input[0] * cospi_16_64 vmull.s16 q2, d16, d0 vmull.s16 q3, d17, d0 - ; input[2] * cospi_16_64 - vmull.s16 q9, d24, d0 - vmull.s16 q11, d25, d0 + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 ; (input[0] + input[2]) * cospi_16_64 - vadd.s32 q9, q2, q9 - vadd.s32 q11, q3, q11 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q9, #14 ; >> 14 - vqrshrn.s32 d19, q11, #14 ; >> 14 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 - ; input[2] * cospi_16_64 - vmull.s16 q13, d24, d0 - 
vmull.s16 q15, d25, d0 + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 - ; (input[0] - input[2]) * cospi_16_64 - vsub.s32 q2, q2, q13 - vsub.s32 q3, q3, q15 + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q2, #14 ; >> 14 - vqrshrn.s32 d23, q3, #14 ; >> 14 + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - ; input[1] * cospi_24_64 vmull.s16 q2, d20, d0 vmull.s16 q3, d21, d0 - ; input[3] * cospi_8_64 - vmull.s16 q13, d28, d1 - vmull.s16 q15, d29, d1 + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vsub.s32 q2, q2, q13 - vsub.s32 q3, q3, q15 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 ; dct_const_round_shift(input_dc * cospi_16_64) vqrshrn.s32 d26, q2, #14 ; >> 14 vqrshrn.s32 d27, q3, #14 ; >> 14 - ; input[1] * cospi_8_64 - vmull.s16 q2, d20, d1 - vmull.s16 q3, d21, d1 - - ; input[3] * cospi_24_64 - vmull.s16 q8, d28, d0 - vmull.s16 q10, d29, d0 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vadd.s32 q0, q2, q8 - vadd.s32 q1, q3, q10 - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q0, #14 ; >> 14 - vqrshrn.s32 d31, q1, #14 ; >> 14 - + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + ; stage 2 - odd half vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - ; step2[6] * cospi_16_64 vmull.s16 q9, d28, d16 vmull.s16 q10, d29, d16 - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 ; (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q9, q9, q11 - vsub.s32 q10, q10, q12 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 ; dct_const_round_shift(input_dc * cospi_16_64) vqrshrn.s32 d10, q9, #14 ; >> 14 vqrshrn.s32 d11, q10, #14 ; >> 14 - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 ; stage 4 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; @@ -247,14 +206,11 @@ |vp9_short_idct8x8_add_neon| PROC push {r4-r9} - vld1.s16 {q8}, [r0]! - vld1.s16 {q9}, [r0]! - vld1.s16 {q10}, [r0]! - vld1.s16 {q11}, [r0]! - vld1.s16 {q12}, [r0]! - vld1.s16 {q13}, [r0]! - vld1.s16 {q14}, [r0]! 
- vld1.s16 {q15}, [r0]! + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! ; transpose the input data TRANSPOSE8X8 @@ -349,8 +305,215 @@ vst1.64 {d6}, [r0], r2 vst1.64 {d7}, [r0], r2 + vpop {d8-d15} pop {r4-r9} bx lr ENDP ; |vp9_short_idct8x8_add_neon| +;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct10_8x8_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + ; stage 1 + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling + ; multiply and shift the result by 16 bits instead of 14 bits. So we need + ; to double the constants before multiplying to compensate this. + mov r12, r3, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_28_64*2 + mov r12, r4, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; dct_const_round_shift(input[1] * cospi_28_64) + vqrdmulh.s16 q4, q9, q0 + + mov r12, r6, lsl #1 + rsb r12, #0 + vdup.16 q0, r12 ; duplicate -cospi_20_64*2 + + ; dct_const_round_shift(input[1] * cospi_4_64) + vqrdmulh.s16 q7, q9, q1 + + mov r12, r5, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_12_64*2 + + ; dct_const_round_shift(- input[3] * cospi_20_64) + vqrdmulh.s16 q5, q11, q0 + + mov r12, r7, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_16_64*2 + + ; dct_const_round_shift(input[3] * cospi_12_64) + vqrdmulh.s16 q6, q11, q1 + + ; stage 2 & stage 3 - even half + mov r12, r8, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_24_64*2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrdmulh.s16 q9, q8, q0 + + mov r12, r9, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_8_64*2 + + ; dct_const_round_shift(input[1] * cospi_24_64) + vqrdmulh.s16 q13, q10, q1 + + ; dct_const_round_shift(input[1] * cospi_8_64) + vqrdmulh.s16 q15, q10, q0 + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 
d11, q10, #14 ; >> 14
+
+ ; dct_const_round_shift(input_dc * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+
+ ; Transpose the matrix
+ TRANSPOSE8X8
+
+ ; Then transform columns
+ IDCT8x8_1D
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+
+ vpop {d8-d15}
+ pop {r4-r9}
+ bx lr
+ ENDP ; |vp9_short_idct10_8x8_add_neon|
+
+ END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm new file mode 100644 index 0000000..963ef35 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm @@ -0,0 +1,237 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_short_iht4x4_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Parallel 1D IDCT on all the columns of a 4x4 16-bit data matrix which are
+ ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
+ ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
+ ; into d16-d19 registers. This macro will touch the q10 - q15 registers and
+ ; use them as a buffer during calculation.
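+ ;
+ ; For cross-checking against the scalar path, the per-column math is
+ ; roughly the following C sketch (a sketch, not the library's exact
+ ; source; cospi_N_64 denotes round(cos(N * pi / 64) * (1 << 14)), and
+ ; each vqrshrn.s32 #14 below is one dct_const_round_shift):
+ ;
+ ;   int16_t dct_const_round_shift(int32_t x) {
+ ;     return (int16_t)((x + (1 << 13)) >> 14);  /* round half up, >> 14 */
+ ;   }
+ ;   step[0] = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
+ ;   step[1] = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
+ ;   step[2] = dct_const_round_shift(in[1] * cospi_24_64 - in[3] * cospi_8_64);
+ ;   step[3] = dct_const_round_shift(in[1] * cospi_8_64  + in[3] * cospi_24_64);
+ ;   out[0] = step[0] + step[3];  out[1] = step[1] + step[2];
+ ;   out[2] = step[1] - step[2];  out[3] = step[0] - step[3];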
+ MACRO
+ IDCT4x4_1D
+ ; stage 1
+ vadd.s16 d23, d16, d18 ; (input[0] + input[2])
+ vsub.s16 d24, d16, d18 ; (input[0] - input[2])
+
+ vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
+ vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
+ vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
+ vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
+ vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+ vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d26, q13, #14
+ vqrshrn.s32 d27, q14, #14
+ vqrshrn.s32 d29, q15, #14
+ vqrshrn.s32 d28, q10, #14
+
+ ; stage 2
+ ; output[0] = step[0] + step[3];
+ ; output[1] = step[1] + step[2];
+ ; output[3] = step[0] - step[3];
+ ; output[2] = step[1] - step[2];
+ vadd.s16 q8, q13, q14
+ vsub.s16 q9, q13, q14
+ vswp d18, d19
+ MEND
+
+ ; Parallel 1D IADST on all the columns of a 4x4 16-bit data matrix, which
+ ; is loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain
+ ; sinpi_2_9. d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The
+ ; output will be stored back into d16-d19 registers. This macro will touch
+ ; the q8 - q15 registers (q8/q9 carry the input and output) and use them
+ ; as a buffer during calculation.
+ MACRO
+ IADST4x4_1D
+ vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
+ vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
+ vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
+ vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
+ vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
+ vmovl.s16 q15, d16 ; expand x0 from 16 bits to 32 bits
+ vaddw.s16 q15, q15, d19 ; x0 + x3
+ vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
+ vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
+ vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
+
+ vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
+ vadd.s32 q10, q10, q8
+ vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
+ vdup.32 q8, r0 ; duplicate sinpi_3_9
+ vsub.s32 q11, q11, q9
+ vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
+
+ vadd.s32 q13, q10, q12 ; s0 = x0 + x3
+ vadd.s32 q10, q10, q11 ; x0 + x1
+ vadd.s32 q14, q11, q12 ; s1 = x1 + x3
+ vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
+
+ ; dct_const_round_shift
+ vqrshrn.s32 d16, q13, #14
+ vqrshrn.s32 d17, q14, #14
+ vqrshrn.s32 d18, q15, #14
+ vqrshrn.s32 d19, q10, #14
+ MEND
+
+ ; Generate cosine constants in d0 - d2 for the IDCT.
+ MACRO
+ GENERATE_COSINE_CONSTANTS
+ ; cospi_8_64 = 15137 = 0x3b21
+ mov r0, #0x3b00
+ add r0, #0x21
+ ; cospi_16_64 = 11585 = 0x2d41
+ mov r3, #0x2d00
+ add r3, #0x41
+ ; cospi_24_64 = 6270 = 0x187e
+ mov r12, #0x1800
+ add r12, #0x7e
+
+ ; generate constant vectors
+ vdup.16 d0, r0 ; duplicate cospi_8_64
+ vdup.16 d1, r3 ; duplicate cospi_16_64
+ vdup.16 d2, r12 ; duplicate cospi_24_64
+ MEND
+
+ ; Generate sine constants in d3 - d6 for the IADST (sinpi_3_9 is also
+ ; left in r0 for later scalar use).
+ MACRO
+ GENERATE_SINE_CONSTANTS
+ ; sinpi_1_9 = 5283 = 0x14A3
+ mov r0, #0x1400
+ add r0, #0xa3
+ ; sinpi_2_9 = 9929 = 0x26C9
+ mov r3, #0x2600
+ add r3, #0xc9
+ ; sinpi_4_9 = 15212 = 0x3B6C
+ mov r12, #0x3b00
+ add r12, #0x6c
+
+ ; generate constant vectors
+ vdup.16 d3, r0 ; duplicate sinpi_1_9
+
+ ; sinpi_3_9 = 13377 = 0x3441
+ mov r0, #0x3400
+ add r0, #0x41
+
+ vdup.16 d4, r3 ; duplicate sinpi_2_9
+ vdup.16 d5, r12 ; duplicate sinpi_4_9
+ vdup.16 q3, r0 ; duplicate sinpi_3_9
+ MEND
+
+ ; Transpose a 4x4 16-bit data matrix. The data is loaded in d16-d19.
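+ ; The vtrn.16 pair exchanges the odd 16-bit lanes between each register
+ ; pair and vtrn.32 then exchanges the odd 32-bit lane pairs; the two
+ ; passes compose into the full transpose. A scalar C sketch of the
+ ; intended effect (a hypothetical helper, for illustration only):
+ ;
+ ;   void transpose_4x4(int16_t m[4][4]) {
+ ;     int r, c;
+ ;     for (r = 0; r < 4; ++r)
+ ;       for (c = r + 1; c < 4; ++c) {
+ ;         const int16_t t = m[r][c];  /* swap across the diagonal */
+ ;         m[r][c] = m[c][r];
+ ;         m[c][r] = t;
+ ;       }
+ ;   }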
+ MACRO + TRANSPOSE4X4 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + vtrn.32 q8, q9 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride, int tx_type) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride +; r3 int tx_type) +; This function will only handle tx_type of 1,2,3. +|vp9_short_iht4x4_add_neon| PROC + + ; load the inputs into d16-d19 + vld1.s16 {q8,q9}, [r0]! + + ; transpose the input data + TRANSPOSE4X4 + + ; decide the type of transform + cmp r3, #2 + beq idct_iadst + cmp r3, #3 + beq iadst_iadst + +iadst_idct + ; generate constants + GENERATE_COSINE_CONSTANTS + GENERATE_SINE_CONSTANTS + + ; first transform rows + IDCT4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IADST4x4_1D + + b end_vp9_short_iht4x4_add_neon + +idct_iadst + ; generate constants + GENERATE_COSINE_CONSTANTS + GENERATE_SINE_CONSTANTS + + ; first transform rows + IADST4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IDCT4x4_1D + + b end_vp9_short_iht4x4_add_neon + +iadst_iadst + ; generate constants + GENERATE_SINE_CONSTANTS + + ; first transform rows + IADST4x4_1D + + ; transpose the matrix + TRANSPOSE4X4 + + ; then transform columns + IADST4x4_1D + +end_vp9_short_iht4x4_add_neon + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1], r2 + vld1.32 {d27[1]}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |vp9_short_iht4x4_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm new file mode 100644 index 0000000..bab9cb4 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm @@ -0,0 +1,696 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_iht8x8_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Generate IADST constants in r0 - r12 for the IADST. 
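+ ; The constants are materialized with mov/add pairs because a classic ARM
+ ; data-processing immediate is an 8-bit value with an even rotation, so a
+ ; full 16-bit constant such as 16305 = 0x3fb1 cannot be encoded in one
+ ; mov (movw would also work on ARMv7; keeping the pair is presumably for
+ ; wider assembler compatibility, which is an assumption on this editor's
+ ; part). Sketch of the pattern in C terms:
+ ;
+ ;   r0 = 0x3f00;   /* mov: high byte, encodable as a rotated 8-bit value */
+ ;   r0 += 0xb1;    /* add: low byte, giving cospi_2_64 = 16305 */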
+ MACRO
+ GENERATE_IADST_CONSTANTS
+ ; generate cospi_2_64 = 16305
+ mov r0, #0x3f00
+ add r0, #0xb1
+
+ ; generate cospi_30_64 = 1606
+ mov r1, #0x600
+ add r1, #0x46
+
+ ; generate cospi_10_64 = 14449
+ mov r2, #0x3800
+ add r2, #0x71
+
+ ; generate cospi_22_64 = 7723
+ mov r3, #0x1e00
+ add r3, #0x2b
+
+ ; generate cospi_18_64 = 10394
+ mov r4, #0x2800
+ add r4, #0x9a
+
+ ; generate cospi_14_64 = 12665
+ mov r5, #0x3100
+ add r5, #0x79
+
+ ; generate cospi_26_64 = 4756
+ mov r6, #0x1200
+ add r6, #0x94
+
+ ; generate cospi_6_64 = 15679
+ mov r7, #0x3d00
+ add r7, #0x3f
+
+ ; generate cospi_8_64 = 15137
+ mov r8, #0x3b00
+ add r8, #0x21
+
+ ; generate cospi_24_64 = 6270
+ mov r9, #0x1800
+ add r9, #0x7e
+
+ ; generate 0
+ mov r10, #0
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+ MEND
+
+ ; Generate IDCT constants in r3 - r9 for the IDCT.
+ MACRO
+ GENERATE_IDCT_CONSTANTS
+ ; generate cospi_28_64 = 3196
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137
+ mov r9, #0x3b00
+ add r9, #0x21
+ MEND
+
+ ; Transpose an 8x8 16-bit data matrix. The data is loaded in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix which
+ ; are loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The
+ ; output will be stored back into q8-q15 registers. This macro will touch
+ ; the q0-q7 registers and use them as a buffer during calculation.
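+ ;
+ ; For reference, stage 1 of the odd half below corresponds to this C
+ ; sketch (same dct_const_round_shift((x + (1 << 13)) >> 14) rounding as
+ ; in the 4x4 version; a sketch, not the library's exact source):
+ ;
+ ;   step1[4] = dct_const_round_shift(in[1] * cospi_28_64 - in[7] * cospi_4_64);
+ ;   step1[7] = dct_const_round_shift(in[1] * cospi_4_64  + in[7] * cospi_28_64);
+ ;   step1[5] = dct_const_round_shift(in[5] * cospi_12_64 - in[3] * cospi_20_64);
+ ;   step1[6] = dct_const_round_shift(in[5] * cospi_20_64 + in[3] * cospi_12_64);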
+ MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7 ; duplicate cospi_16_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 + + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 + + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 + + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * 
cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + MEND + + ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which + ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The + ; output will be stored back into q8-q15 registers. This macro will touch + ; q0 - q7 registers and use them as buffer during calculation. + MACRO + IADST8X8_1D + vdup.16 d14, r0 ; duplicate cospi_2_64 + vdup.16 d15, r1 ; duplicate cospi_30_64 + + ; cospi_2_64 * x0 + vmull.s16 q1, d30, d14 + vmull.s16 q2, d31, d14 + + ; cospi_30_64 * x0 + vmull.s16 q3, d30, d15 + vmull.s16 q4, d31, d15 + + vdup.16 d30, r4 ; duplicate cospi_18_64 + vdup.16 d31, r5 ; duplicate cospi_14_64 + + ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + vmlal.s16 q1, d16, d15 + vmlal.s16 q2, d17, d15 + + ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1 + vmlsl.s16 q3, d16, d14 + vmlsl.s16 q4, d17, d14 + + ; cospi_18_64 * x4 + vmull.s16 q5, d22, d30 + vmull.s16 q6, d23, d30 + + ; cospi_14_64 * x4 + vmull.s16 q7, d22, d31 + vmull.s16 q8, d23, d31 + + ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + vmlal.s16 q5, d24, d31 + vmlal.s16 q6, d25, d31 + + ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5 + vmlsl.s16 q7, d24, d30 + vmlsl.s16 q8, d25, d30 + + ; (s0 + s4) + vadd.s32 q11, q1, q5 + vadd.s32 q12, q2, q6 + + vdup.16 d0, r2 ; duplicate cospi_10_64 + vdup.16 d1, r3 ; duplicate cospi_22_64 + + ; (s0 - s4) + vsub.s32 q1, q1, q5 + vsub.s32 q2, q2, q6 + + ; x0 = dct_const_round_shift(s0 + s4); + vqrshrn.s32 d22, q11, #14 ; >> 14 + vqrshrn.s32 d23, q12, #14 ; >> 14 + + ; (s1 + s5) + vadd.s32 q12, q3, q7 + vadd.s32 q15, q4, q8 + + ; (s1 - s5) + vsub.s32 q3, q3, q7 + vsub.s32 q4, q4, q8 + + ; x4 = dct_const_round_shift(s0 - s4); + vqrshrn.s32 d2, q1, #14 ; >> 14 + vqrshrn.s32 d3, q2, #14 ; >> 14 + + ; x1 = dct_const_round_shift(s1 + s5); + vqrshrn.s32 d24, q12, #14 ; >> 14 + vqrshrn.s32 d25, q15, #14 ; >> 14 + + ; x5 = dct_const_round_shift(s1 - s5); + vqrshrn.s32 d6, q3, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; cospi_10_64 * x2 + vmull.s16 q4, d26, d0 + vmull.s16 q5, d27, d0 + + ; cospi_22_64 * x2 + vmull.s16 q2, d26, d1 + vmull.s16 q6, d27, d1 + + vdup.16 d30, r6 ; duplicate cospi_26_64 + vdup.16 d31, r7 ; duplicate cospi_6_64 + + ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + vmlal.s16 q4, d20, d1 + vmlal.s16 q5, d21, d1 + + ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + vmlsl.s16 q2, d20, d0 + vmlsl.s16 q6, d21, d0 + + ; cospi_26_64 * x6 + vmull.s16 q0, d18, d30 + vmull.s16 q13, d19, d30 + + ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + vmlal.s16 q0, d28, d31 + vmlal.s16 q13, d29, d31 + + ; cospi_6_64 * x6 + vmull.s16 q10, d18, d31 + vmull.s16 q9, d19, d31 + + ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + vmlsl.s16 q10, d28, d30 + vmlsl.s16 q9, d29, d30 + + ; (s3 + s7) + vadd.s32 q14, q2, q10 + vadd.s32 
q15, q6, q9
+
+ ; (s3 - s7)
+ vsub.s32 q2, q2, q10
+ vsub.s32 q6, q6, q9
+
+ ; x3 = dct_const_round_shift(s3 + s7);
+ vqrshrn.s32 d28, q14, #14 ; >> 14
+ vqrshrn.s32 d29, q15, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s3 - s7);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; (s2 + s6)
+ vadd.s32 q9, q4, q0
+ vadd.s32 q10, q5, q13
+
+ ; (s2 - s6)
+ vsub.s32 q4, q4, q0
+ vsub.s32 q5, q5, q13
+
+ vdup.16 d30, r8 ; duplicate cospi_8_64
+ vdup.16 d31, r9 ; duplicate cospi_24_64
+
+ ; x2 = dct_const_round_shift(s2 + s6);
+ vqrshrn.s32 d18, q9, #14 ; >> 14
+ vqrshrn.s32 d19, q10, #14 ; >> 14
+
+ ; x6 = dct_const_round_shift(s2 - s6);
+ vqrshrn.s32 d8, q4, #14 ; >> 14
+ vqrshrn.s32 d9, q5, #14 ; >> 14
+
+ ; cospi_8_64 * x4
+ vmull.s16 q5, d2, d30
+ vmull.s16 q6, d3, d30
+
+ ; cospi_24_64 * x4
+ vmull.s16 q7, d2, d31
+ vmull.s16 q0, d3, d31
+
+ ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ vmlal.s16 q5, d6, d31
+ vmlal.s16 q6, d7, d31
+
+ ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ vmlsl.s16 q7, d6, d30
+ vmlsl.s16 q0, d7, d30
+
+ ; cospi_8_64 * x7
+ vmull.s16 q1, d4, d30
+ vmull.s16 q3, d5, d30
+
+ ; cospi_24_64 * x7
+ vmull.s16 q10, d4, d31
+ vmull.s16 q2, d5, d31
+
+ ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ vmlsl.s16 q1, d8, d31
+ vmlsl.s16 q3, d9, d31
+
+ ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ vmlal.s16 q10, d8, d30
+ vmlal.s16 q2, d9, d30
+
+ vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
+
+ vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
+
+ vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
+
+ vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
+
+ ; (s4 + s6)
+ vadd.s32 q14, q5, q1
+ vadd.s32 q15, q6, q3
+
+ ; (s4 - s6)
+ vsub.s32 q5, q5, q1
+ vsub.s32 q6, q6, q3
+
+ ; x4 = dct_const_round_shift(s4 + s6);
+ vqrshrn.s32 d18, q14, #14 ; >> 14
+ vqrshrn.s32 d19, q15, #14 ; >> 14
+
+ ; x6 = dct_const_round_shift(s4 - s6);
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; (s5 + s7)
+ vadd.s32 q1, q7, q10
+ vadd.s32 q3, q0, q2
+
+ ; (s5 - s7)
+ vsub.s32 q7, q7, q10
+ vsub.s32 q0, q0, q2
+
+ ; x5 = dct_const_round_shift(s5 + s7);
+ vqrshrn.s32 d28, q1, #14 ; >> 14
+ vqrshrn.s32 d29, q3, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s5 - s7);
+ vqrshrn.s32 d14, q7, #14 ; >> 14
+ vqrshrn.s32 d15, q0, #14 ; >> 14
+
+ vdup.16 d30, r12 ; duplicate cospi_16_64
+
+ ; cospi_16_64 * x2
+ vmull.s16 q2, d22, d30
+ vmull.s16 q3, d23, d30
+
+ ; cospi_16_64 * x2
+ vmull.s16 q13, d22, d30
+ vmull.s16 q1, d23, d30
+
+ ; cospi_16_64 * x2 + cospi_16_64 * x3;
+ vmlal.s16 q2, d24, d30
+ vmlal.s16 q3, d25, d30
+
+ ; cospi_16_64 * x2 - cospi_16_64 * x3;
+ vmlsl.s16 q13, d24, d30
+ vmlsl.s16 q1, d25, d30
+
+ ; x2 = dct_const_round_shift(s2);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q3, #14 ; >> 14
+
+ ; x3 = dct_const_round_shift(s3);
+ vqrshrn.s32 d24, q13, #14 ; >> 14
+ vqrshrn.s32 d25, q1, #14 ; >> 14
+
+ ; cospi_16_64 * x6
+ vmull.s16 q13, d10, d30
+ vmull.s16 q1, d11, d30
+
+ ; cospi_16_64 * x6
+ vmull.s16 q11, d10, d30
+ vmull.s16 q0, d11, d30
+
+ ; cospi_16_64 * x6 + cospi_16_64 * x7;
+ vmlal.s16 q13, d14, d30
+ vmlal.s16 q1, d15, d30
+
+ ; cospi_16_64 * x6 - cospi_16_64 * x7;
+ vmlsl.s16 q11, d14, d30
+ vmlsl.s16 q0, d15, d30
+
+ ; x6 = dct_const_round_shift(s6);
+ vqrshrn.s32 d20, q13, #14 ; >> 14
+ vqrshrn.s32 d21, q1, #14 ; >> 14
+
+ ; x7 = dct_const_round_shift(s7);
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q0, #14 ; >> 14
+
+ vdup.16 q5, r10 ; duplicate 0
+
+ vsub.s16 q9, q5, q9 ; output[1] = -x4;
+ vsub.s16 q11, q5, q2 ; output[3] = -x2;
+ vsub.s16 q13, q5, q6 ;
output[5] = -x7; + vsub.s16 q15, q5, q4 ; output[7] = -x1; + MEND + + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride, int tx_type) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride +; r3 int tx_type) +; This function will only handle tx_type of 1,2,3. +|vp9_short_iht8x8_add_neon| PROC + + ; load the inputs into d16-d19 + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + push {r0-r10} + + ; transpose the input data + TRANSPOSE8X8 + + ; decide the type of transform + cmp r3, #2 + beq idct_iadst + cmp r3, #3 + beq iadst_iadst + +iadst_idct + ; generate IDCT constants + GENERATE_IDCT_CONSTANTS + + ; first transform rows + IDCT8x8_1D + + ; transpose the matrix + TRANSPOSE8X8 + + ; generate IADST constants + GENERATE_IADST_CONSTANTS + + ; then transform columns + IADST8X8_1D + + b end_vp9_short_iht8x8_add_neon + +idct_iadst + ; generate IADST constants + GENERATE_IADST_CONSTANTS + + ; first transform rows + IADST8X8_1D + + ; transpose the matrix + TRANSPOSE8X8 + + ; generate IDCT constants + GENERATE_IDCT_CONSTANTS + + ; then transform columns + IDCT8x8_1D + + b end_vp9_short_iht8x8_add_neon + +iadst_iadst + ; generate IADST constants + GENERATE_IADST_CONSTANTS + + ; first transform rows + IADST8X8_1D + + ; transpose the matrix + TRANSPOSE8X8 + + ; then transform columns + IADST8X8_1D + +end_vp9_short_iht8x8_add_neon + pop {r0-r10} + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + bx lr + ENDP ; |vp9_short_iht8x8_add_neon| + + END diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c index 79092cd..f144721 100644 --- a/libvpx/vp9/common/generic/vp9_systemdependent.c +++ b/libvpx/vp9/common/generic/vp9_systemdependent.c @@ -13,6 +13,7 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_onyxc_int.h" -void vp9_machine_specific_config(VP9_COMMON *ctx) { +void vp9_machine_specific_config(VP9_COMMON *cm) { + (void)cm; vp9_rtcd(); } diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 554a317..864e27e 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -31,40 +31,30 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO)); } -void 
vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) { - int i, j; - - // For each in image mode_info element set the in image flag to 1 - for (i = 0; i < cm->mi_rows; i++) { - MODE_INFO *ptr = mi; - for (j = 0; j < cm->mi_cols; j++) { - ptr->mbmi.mb_in_image = 1; - ptr++; // Next element in the row - } - - // Step over border element at start of next row - mi += cm->mode_info_stride; - } -} - -void vp9_free_frame_buffers(VP9_COMMON *oci) { +void vp9_free_frame_buffers(VP9_COMMON *cm) { int i; for (i = 0; i < NUM_YV12_BUFFERS; i++) - vp9_free_frame_buffer(&oci->yv12_fb[i]); + vp9_free_frame_buffer(&cm->yv12_fb[i]); - vp9_free_frame_buffer(&oci->post_proc_buffer); + vp9_free_frame_buffer(&cm->post_proc_buffer); - vpx_free(oci->mip); - vpx_free(oci->prev_mip); - vpx_free(oci->above_seg_context); + vpx_free(cm->mip); + vpx_free(cm->prev_mip); + vpx_free(cm->above_seg_context); + vpx_free(cm->last_frame_seg_map); + vpx_free(cm->mi_grid_base); + vpx_free(cm->prev_mi_grid_base); - vpx_free(oci->above_context[0]); + vpx_free(cm->above_context[0]); for (i = 0; i < MAX_MB_PLANE; i++) - oci->above_context[i] = 0; - oci->mip = NULL; - oci->prev_mip = NULL; - oci->above_seg_context = NULL; + cm->above_context[i] = 0; + cm->mip = NULL; + cm->prev_mip = NULL; + cm->above_seg_context = NULL; + cm->last_frame_seg_map = NULL; + cm->mi_grid_base = NULL; + cm->prev_mi_grid_base = NULL; } static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { @@ -72,112 +62,125 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { cm->mb_rows = (aligned_height + 8) >> 4; cm->MBs = cm->mb_rows * cm->mb_cols; - cm->mi_cols = aligned_width >> LOG2_MI_SIZE; - cm->mi_rows = aligned_height >> LOG2_MI_SIZE; + cm->mi_cols = aligned_width >> MI_SIZE_LOG2; + cm->mi_rows = aligned_height >> MI_SIZE_LOG2; cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; } static void setup_mi(VP9_COMMON *cm) { cm->mi = cm->mip + cm->mode_info_stride + 1; cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; vpx_memset(cm->mip, 0, cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_in_image(cm, cm->mi); + vpx_memset(cm->mi_grid_base, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * + sizeof(*cm->mi_grid_base)); + vp9_update_mode_info_border(cm, cm->mip); vp9_update_mode_info_border(cm, cm->prev_mip); - vp9_update_mode_info_in_image(cm, cm->prev_mi); } -int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { +int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { int i, mi_cols; - const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE); - const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE); - const int ss_x = oci->subsampling_x; - const int ss_y = oci->subsampling_y; + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + const int ss_x = cm->subsampling_x; + const int ss_y = cm->subsampling_y; int mi_size; - vp9_free_frame_buffers(oci); + vp9_free_frame_buffers(cm); for (i = 0; i < NUM_YV12_BUFFERS; i++) { - oci->fb_idx_ref_cnt[i] = 0; - if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y, + cm->fb_idx_ref_cnt[i] = 0; + if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) 
goto fail; } - oci->new_fb_idx = NUM_YV12_BUFFERS - 1; - oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1; + cm->new_fb_idx = NUM_YV12_BUFFERS - 1; + cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++) - oci->active_ref_idx[i] = i; + cm->active_ref_idx[i] = i; for (i = 0; i < NUM_REF_FRAMES; i++) { - oci->ref_frame_map[i] = i; - oci->fb_idx_ref_cnt[i] = 1; + cm->ref_frame_map[i] = i; + cm->fb_idx_ref_cnt[i] = 1; } - if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y, + if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) goto fail; - set_mb_mi(oci, aligned_width, aligned_height); + set_mb_mi(cm, aligned_width, aligned_height); // Allocation - mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE); + mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); - oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!oci->mip) + cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (!cm->mip) goto fail; - oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!oci->prev_mip) + cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (!cm->prev_mip) goto fail; - setup_mi(oci); + cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + if (!cm->mi_grid_base) + goto fail; + + cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); + if (!cm->prev_mi_grid_base) + goto fail; + + setup_mi(cm); // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(oci->mi_cols); + mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm // block where mi unit size is 8x8. -# if CONFIG_ALPHA - oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1); -#else - oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1); -#endif - if (!oci->above_context[0]) + cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * + (2 * mi_cols), 1); + if (!cm->above_context[0]) goto fail; - oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); - if (!oci->above_seg_context) + cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); + if (!cm->above_seg_context) + goto fail; + + // Create the segmentation map structure and set to 0. 
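+ // One byte per 8x8 mode-info unit; a lookup is then roughly
+ // seg_map[mi_row * cm->mi_cols + mi_col] (a sketch of the indexing,
+ // not a helper defined in this file).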
+ cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + if (!cm->last_frame_seg_map) goto fail; return 0; fail: - vp9_free_frame_buffers(oci); + vp9_free_frame_buffers(cm); return 1; } -void vp9_create_common(VP9_COMMON *oci) { - vp9_machine_specific_config(oci); +void vp9_create_common(VP9_COMMON *cm) { + vp9_machine_specific_config(cm); - vp9_init_mbmode_probs(oci); + vp9_init_mbmode_probs(cm); - oci->tx_mode = ONLY_4X4; - oci->comp_pred_mode = HYBRID_PREDICTION; + cm->tx_mode = ONLY_4X4; + cm->comp_pred_mode = HYBRID_PREDICTION; // Initialize reference frame sign bias structure to defaults - vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); } -void vp9_remove_common(VP9_COMMON *oci) { - vp9_free_frame_buffers(oci); +void vp9_remove_common(VP9_COMMON *cm) { + vp9_free_frame_buffers(cm); } void vp9_initialize_common() { @@ -188,8 +191,8 @@ void vp9_initialize_common() { void vp9_update_frame_size(VP9_COMMON *cm) { int i, mi_cols; - const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE); - const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE); + const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); @@ -198,4 +201,8 @@ void vp9_update_frame_size(VP9_COMMON *cm) { for (i = 1; i < MAX_MB_PLANE; i++) cm->above_context[i] = cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; + + // Initialize the previous frame segment map to 0. + if (cm->last_frame_seg_map) + vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h index 8bf5ed1..5d5fae9 100644 --- a/libvpx/vp9/common/vp9_alloccommon.h +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -16,14 +16,13 @@ void vp9_initialize_common(); -void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi); -void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi); +void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi); -void vp9_create_common(VP9_COMMON *oci); -void vp9_remove_common(VP9_COMMON *oci); +void vp9_create_common(VP9_COMMON *cm); +void vp9_remove_common(VP9_COMMON *cm); -int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height); -void vp9_free_frame_buffers(VP9_COMMON *oci); +int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height); +void vp9_free_frame_buffers(VP9_COMMON *cm); void vp9_update_frame_size(VP9_COMMON *cm); diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index f68c5c6..c8d677f 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -19,9 +19,9 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_common_data.h" -#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" @@ -56,11 +56,11 @@ typedef enum { } FRAME_TYPE; typedef enum { - EIGHTTAP_SMOOTH, - EIGHTTAP, - EIGHTTAP_SHARP, - BILINEAR, - SWITCHABLE /* should be the last one */ + EIGHTTAP = 0, + EIGHTTAP_SMOOTH = 1, + EIGHTTAP_SHARP = 2, + BILINEAR = 3, + SWITCHABLE = 4 /* should be the last one */ } INTERPOLATIONFILTERTYPE; typedef enum { @@ -71,7 +71,7 @@ typedef enum { D135_PRED, // Directional 135 deg = 180 - 45 
D117_PRED, // Directional 117 deg = 180 - 63 D153_PRED, // Directional 153 deg = 180 - 27 - D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi) + D207_PRED, // Directional 207 deg = 180 + 27 D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) TM_PRED, // True-motion NEARESTMV, @@ -89,9 +89,9 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } -#define VP9_INTRA_MODES (TM_PRED + 1) +#define INTRA_MODES (TM_PRED + 1) -#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV) +#define INTER_MODES (1 + NEWMV - NEARESTMV) static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { return (mode - NEARESTMV); @@ -115,45 +115,41 @@ typedef enum { MAX_REF_FRAMES = 4 } MV_REFERENCE_FRAME; -static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int b_width_log2(BLOCK_SIZE sb_type) { return b_width_log2_lookup[sb_type]; } -static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int b_height_log2(BLOCK_SIZE sb_type) { return b_height_log2_lookup[sb_type]; } -static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int mi_width_log2(BLOCK_SIZE sb_type) { return mi_width_log2_lookup[sb_type]; } -static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) { +static INLINE int mi_height_log2(BLOCK_SIZE sb_type) { return mi_height_log2_lookup[sb_type]; } +// This structure now relates to 8x8 block regions. typedef struct { MB_PREDICTION_MODE mode, uv_mode; MV_REFERENCE_FRAME ref_frame[2]; - TX_SIZE txfm_size; - int_mv mv[2]; // for each reference frame used + TX_SIZE tx_size; + int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int_mv best_mv, best_second_mv; - uint8_t mb_mode_context[MAX_REF_FRAMES]; + uint8_t mode_context[MAX_REF_FRAMES]; - unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ - unsigned char segment_id; // Segment id for current frame + unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients + unsigned char segment_id; // Segment id for this block. 
- // Flags used for prediction status of various bistream signals + // Flags used for prediction status of various bit-stream signals unsigned char seg_id_predicted; - // Indicates if the mb is part of the image (1) vs border (0) - // This can be useful in determining whether the MB provides - // a valid predictor - unsigned char mb_in_image; - INTERPOLATIONFILTERTYPE interp_filter; - BLOCK_SIZE_TYPE sb_type; + BLOCK_SIZE sb_type; } MB_MODE_INFO; typedef struct { @@ -161,36 +157,19 @@ typedef struct { union b_mode_info bmi[4]; } MODE_INFO; -static int is_inter_block(const MB_MODE_INFO *mbmi) { +static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME; } +static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[1] > INTRA_FRAME; +} enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; -#define VP9_REF_SCALE_SHIFT 14 -#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT) - -struct scale_factors { - int x_scale_fp; // horizontal fixed point scale factor - int y_scale_fp; // vertical fixed point scale factor - int x_offset_q4; - int x_step_q4; - int y_offset_q4; - int y_step_q4; - - int (*scale_value_x)(int val, const struct scale_factors *scale); - int (*scale_value_y)(int val, const struct scale_factors *scale); - void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); - MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale); - MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale); - - convolve_fn_t predict[2][2][2]; // horiz, vert, avg -}; - #if CONFIG_ALPHA enum { MAX_MB_PLANE = 4 }; #else @@ -216,45 +195,27 @@ struct macroblockd_plane { ENTROPY_CONTEXT *left_context; }; -#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n)) - -#define MAX_REF_LF_DELTAS 4 -#define MAX_MODE_LF_DELTAS 2 - -struct loopfilter { - int filter_level; - - int sharpness_level; - int last_sharpness_level; - - uint8_t mode_ref_delta_enabled; - uint8_t mode_ref_delta_update; - - // 0 = Intra, Last, GF, ARF - signed char ref_deltas[MAX_REF_LF_DELTAS]; - signed char last_ref_deltas[MAX_REF_LF_DELTAS]; - - // 0 = ZERO_MV, MV - signed char mode_deltas[MAX_MODE_LF_DELTAS]; - signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; -}; +#define BLOCK_OFFSET(x, i) ((x) + (i) * 16) typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; struct scale_factors scale_factor[2]; - MODE_INFO *prev_mode_info_context; - MODE_INFO *mode_info_context; + MODE_INFO *last_mi; + MODE_INFO *this_mi; int mode_info_stride; + MODE_INFO *mic_stream_ptr; + + // A NULL indicates that the 8x8 is not part of the image + MODE_INFO **mi_8x8; + MODE_INFO **prev_mi_8x8; + int up_available; int left_available; int right_available; - struct segmentation seg; - struct loopfilter lf; - // partition contexts PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT *left_seg_context; @@ -286,7 +247,7 @@ typedef struct macroblockd { } MACROBLOCKD; -static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { +static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) { switch (subsize) { case BLOCK_64X64: case BLOCK_64X32: @@ -311,9 +272,8 @@ static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsi } } -static INLINE void update_partition_context(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE sb_type, - BLOCK_SIZE_TYPE sb_size) { +static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type, + BLOCK_SIZE sb_size) { const int bsl = b_width_log2(sb_size), 
bs = (1 << bsl) / 2; const int bwl = b_width_log2(sb_type); const int bhl = b_height_log2(sb_type); @@ -331,8 +291,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs); } -static INLINE int partition_plane_context(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE sb_type) { +static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE sb_type) { int bsl = mi_width_log2(sb_type), bs = 1 << bsl; int above = 0, left = 0, i; int boffset = mi_width_log2(BLOCK_64X64) - bsl; @@ -352,10 +311,9 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } -static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize, - PARTITION_TYPE partition) { - BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize]; - assert(subsize != BLOCK_SIZE_TYPES); +static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { + const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; + assert(subsize < BLOCK_SIZES); return subsize; } @@ -363,7 +321,7 @@ extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int ib) { - const MODE_INFO *const mi = xd->mode_info_context; + const MODE_INFO *const mi = xd->this_mi; const MB_MODE_INFO *const mbmi = &mi->mbmi; if (plane_type != PLANE_TYPE_Y_WITH_DC || @@ -378,13 +336,13 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; + mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT; } static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, const MACROBLOCKD *xd) { return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; + mode2txfm_map[xd->this_mi->mbmi.mode] : DCT_DCT; } static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { @@ -404,259 +362,147 @@ static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]); + return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]); } -struct plane_block_idx { - int plane; - int block; -}; - -// TODO(jkoleszar): returning a struct so it can be used in a const context, -// expect to refactor this further later. 
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks, - int b_idx) { - const int v_offset = y_blocks * 5 / 4; - struct plane_block_idx res; - - if (b_idx < y_blocks) { - res.plane = 0; - res.block = b_idx; - } else if (b_idx < v_offset) { - res.plane = 1; - res.block = b_idx - y_blocks; - } else { - assert(b_idx < y_blocks * 3 / 2); - res.plane = 2; - res.block = b_idx - v_offset; - } - return res; +static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd) { + BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; + assert(bs < BLOCK_SIZES); + return bs; } -static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize, +static INLINE int plane_block_width(BLOCK_SIZE bsize, const struct macroblockd_plane* plane) { return 4 << (b_width_log2(bsize) - plane->subsampling_x); } -static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize, +static INLINE int plane_block_height(BLOCK_SIZE bsize, const struct macroblockd_plane* plane) { return 4 << (b_height_log2(bsize) - plane->subsampling_y); } -static INLINE int plane_block_width_log2by4( - BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { - return (b_width_log2(bsize) - plane->subsampling_x); -} - -static INLINE int plane_block_height_log2by4( - BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { - return (b_height_log2(bsize) - plane->subsampling_y); -} - typedef void (*foreach_transformed_block_visitor)(int plane, int block, - BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); static INLINE void foreach_transformed_block_in_plane( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane, + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, foreach_transformed_block_visitor visit, void *arg) { - const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); - + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO* mbmi = &xd->this_mi->mbmi; // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 // transform size varies per plane, look it up in a common way. - const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi; const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) - : mbmi->txfm_size; - const int block_size_b = bw + bh; - const int txfrm_size_b = tx_size * 2; - - // subsampled size of the block - const int ss_sum = xd->plane[plane].subsampling_x - + xd->plane[plane].subsampling_y; - const int ss_block_size = block_size_b - ss_sum; - - const int step = 1 << txfrm_size_b; - + : mbmi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); int i; - assert(txfrm_size_b <= block_size_b); - assert(txfrm_size_b <= ss_block_size); - // If mb_to_right_edge is < 0 we are in a situation in which // the current block size extends into the UMV and we won't // visit the sub blocks that are wholly within the UMV. if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { int r, c; - const int sw = bw - xd->plane[plane].subsampling_x; - const int sh = bh - xd->plane[plane].subsampling_y; - int max_blocks_wide = 1 << sw; - int max_blocks_high = 1 << sh; + + int max_blocks_wide = num_4x4_w; + int max_blocks_high = num_4x4_h; // xd->mb_to_right_edge is in units of pixels * 8. This converts // it to 4x4 block sizes. 
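+ // Worked example (assuming no subsampling): a block that overhangs the
+ // right edge by 4 pixels has mb_to_right_edge == -4 * 8 == -32, and
+ // -32 >> 5 == -1, trimming exactly one 4x4 column.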
if (xd->mb_to_right_edge < 0) - max_blocks_wide += - (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); + max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); if (xd->mb_to_bottom_edge < 0) - max_blocks_high += - (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); + max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); i = 0; // Unlike the normal case - in here we have to keep track of the // row and column of the blocks we use so that we know if we are in // the unrestricted motion border. - for (r = 0; r < (1 << sh); r += (1 << tx_size)) { - for (c = 0; c < (1 << sw); c += (1 << tx_size)) { + for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { + for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { if (r < max_blocks_high && c < max_blocks_wide) - visit(plane, i, bsize, txfrm_size_b, arg); + visit(plane, i, plane_bsize, tx_size, arg); i += step; } } } else { - for (i = 0; i < (1 << ss_block_size); i += step) { - visit(plane, i, bsize, txfrm_size_b, arg); - } + for (i = 0; i < num_4x4_w * num_4x4_h; i += step) + visit(plane, i, plane_bsize, tx_size, arg); } } static INLINE void foreach_transformed_block( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, foreach_transformed_block_visitor visit, void *arg) { int plane; - for (plane = 0; plane < MAX_MB_PLANE; plane++) { - foreach_transformed_block_in_plane(xd, bsize, plane, - visit, arg); - } + for (plane = 0; plane < MAX_MB_PLANE; plane++) + foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } static INLINE void foreach_transformed_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, foreach_transformed_block_visitor visit, void *arg) { int plane; - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - foreach_transformed_block_in_plane(xd, bsize, plane, - visit, arg); - } + for (plane = 1; plane < MAX_MB_PLANE; plane++) + foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); } -// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could -// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for -// sizes smaller than 16x16 yet. -typedef void (*foreach_predicted_block_visitor)(int plane, int block, - BLOCK_SIZE_TYPE bsize, - int pred_w, int pred_h, - void *arg); -static INLINE void foreach_predicted_block_in_plane( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane, - foreach_predicted_block_visitor visit, void *arg) { - int i, x, y; - - // block sizes in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // subsampled size of the block - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - - // size of the predictor to use. 
- int pred_w, pred_h; - - if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) { - assert(bsize == BLOCK_8X8); - pred_w = 0; - pred_h = 0; - } else { - pred_w = bwl; - pred_h = bhl; - } - assert(pred_w <= bwl); - assert(pred_h <= bhl); - - // visit each subblock in raster order - i = 0; - for (y = 0; y < 1 << bhl; y += 1 << pred_h) { - for (x = 0; x < 1 << bwl; x += 1 << pred_w) { - visit(plane, i, bsize, pred_w, pred_h, arg); - i += 1 << pred_w; - } - i += (1 << (bwl + pred_h)) - (1 << bwl); - } -} -static INLINE void foreach_predicted_block( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, - foreach_predicted_block_visitor visit, void *arg) { - int plane; - - for (plane = 0; plane < MAX_MB_PLANE; plane++) { - foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); - } +static int raster_block_offset(BLOCK_SIZE plane_bsize, + int raster_block, int stride) { + const int bw = b_width_log2(plane_bsize); + const int y = 4 * (raster_block >> bw); + const int x = 4 * (raster_block & ((1 << bw) - 1)); + return y * stride + x; } -static INLINE void foreach_predicted_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, - foreach_predicted_block_visitor visit, void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); - } +static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, + int raster_block, int16_t *base) { + const int stride = 4 << b_width_log2(plane_bsize); + return base + raster_block_offset(plane_bsize, raster_block, stride); } -static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - int plane, int block, int stride) { - const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1)); - return y * stride + x; +static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize, + int raster_block, uint8_t *base, + int stride) { + return base + raster_block_offset(plane_bsize, raster_block, stride); } -static int16_t* raster_block_offset_int16(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, int16_t *base) { - const int stride = plane_block_width(bsize, &xd->plane[plane]); - return base + raster_block_offset(xd, bsize, plane, block, stride); -} -static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, - uint8_t *base, int stride) { - return base + raster_block_offset(xd, bsize, plane, block, stride); -} - -static int txfrm_block_to_raster_block(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, - int ss_txfrm_size) { - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int txwl = ss_txfrm_size / 2; - const int tx_cols_log2 = bwl - txwl; + +static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int block) { + const int bwl = b_width_log2(plane_bsize); + const int tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> ss_txfrm_size; - const int x = (raster_mb & (tx_cols - 1)) << (txwl); - const int y = raster_mb >> tx_cols_log2 << (txwl); + const int raster_mb = block >> (tx_size << 1); + const int x = (raster_mb & (tx_cols - 1)) << tx_size; + const int y = (raster_mb >> tx_cols_log2) << tx_size; return x + (y << bwl); } -static void txfrm_block_to_raster_xy(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, - int plane, int block, - int ss_txfrm_size, +static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, + 
TX_SIZE tx_size, int block, int *x, int *y) { - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int txwl = ss_txfrm_size / 2; - const int tx_cols_log2 = bwl - txwl; + const int bwl = b_width_log2(plane_bsize); + const int tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> ss_txfrm_size; - *x = (raster_mb & (tx_cols - 1)) << (txwl); - *y = raster_mb >> tx_cols_log2 << (txwl); + const int raster_mb = block >> (tx_size << 1); + *x = (raster_mb & (tx_cols - 1)) << tx_size; + *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, - BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) { - const int bw = plane_block_width(bsize, &xd->plane[plane]); - const int bh = plane_block_height(bsize, &xd->plane[plane]); +static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize, + int plane, int block, TX_SIZE tx_size) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + uint8_t *const buf = pd->dst.buf; + const int stride = pd->dst.stride; + int x, y; - txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); x = x * 4 - 1; y = y * 4 - 1; // Copy a pixel into the umv if we are in a situation where the block size @@ -664,41 +510,38 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, // TODO(JBB): Should be able to do the full extend in place so we don't have // to do this multiple times. if (xd->mb_to_right_edge < 0) { - int umv_border_start = bw - + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x)); + const int bw = 4 << b_width_log2(plane_bsize); + const int umv_border_start = bw + (xd->mb_to_right_edge >> + (3 + pd->subsampling_x)); if (x + bw > umv_border_start) - vpx_memset( - xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride - + umv_border_start, - *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride - + umv_border_start - 1), - bw); + vpx_memset(&buf[y * stride + umv_border_start], + buf[y * stride + umv_border_start - 1], bw); } + if (xd->mb_to_bottom_edge < 0) { - int umv_border_start = bh - + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y)); + const int bh = 4 << b_height_log2(plane_bsize); + const int umv_border_start = bh + (xd->mb_to_bottom_edge >> + (3 + pd->subsampling_y)); int i; - uint8_t c = *(xd->plane[plane].dst.buf - + (umv_border_start - 1) * xd->plane[plane].dst.stride + x); - - uint8_t *d = xd->plane[plane].dst.buf - + umv_border_start * xd->plane[plane].dst.stride + x; + const uint8_t c = buf[(umv_border_start - 1) * stride + x]; + uint8_t *d = &buf[umv_border_start * stride + x]; if (y + bh > umv_border_start) - for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride) + for (i = 0; i < bh; ++i, d += stride) *d = c; } } -static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - int plane, int tx_size_in_blocks, - int eob, int aoff, int loff, +static void set_contexts_on_border(MACROBLOCKD *xd, + struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, + int tx_size_in_blocks, int has_eob, + int aoff, int loff, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { - struct macroblockd_plane *pd = &xd->plane[plane]; + int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize]; + int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize]; int above_contexts = tx_size_in_blocks; int left_contexts = tx_size_in_blocks; - int mi_blocks_wide = 1 << 
plane_block_width_log2by4(bsize, pd); - int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd); int pt; // xd->mb_to_right_edge is in units of pixels * 8. This converts @@ -706,26 +549,47 @@ static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, if (xd->mb_to_right_edge < 0) mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + if (xd->mb_to_bottom_edge < 0) + mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + // this code attempts to avoid copying into contexts that are outside // our border. Any blocks that do are set to 0... if (above_contexts + aoff > mi_blocks_wide) above_contexts = mi_blocks_wide - aoff; - if (xd->mb_to_bottom_edge < 0) - mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - if (left_contexts + loff > mi_blocks_high) left_contexts = mi_blocks_high - loff; for (pt = 0; pt < above_contexts; pt++) - A[pt] = eob > 0; + A[pt] = has_eob; for (pt = above_contexts; pt < tx_size_in_blocks; pt++) A[pt] = 0; for (pt = 0; pt < left_contexts; pt++) - L[pt] = eob > 0; + L[pt] = has_eob; for (pt = left_contexts; pt < tx_size_in_blocks; pt++) L[pt] = 0; } +static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const A = pd->above_context + aoff; + ENTROPY_CONTEXT *const L = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob, + aoff, loff, A, L); + } else { + vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +static int get_tx_eob(struct segmentation *seg, int segment_id, + TX_SIZE tx_size) { + const int eob_max = 16 << (tx_size << 1); + return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; +} #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index fdf37e4..dc41efd 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -13,33 +13,33 @@ #include "vp9/common/vp9_common_data.h" // Log 2 conversion lookup tables for block width and height -const int b_width_log2_lookup[BLOCK_SIZE_TYPES] = +const int b_width_log2_lookup[BLOCK_SIZES] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; -const int b_height_log2_lookup[BLOCK_SIZE_TYPES] = +const int b_height_log2_lookup[BLOCK_SIZES] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; -const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] = +const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; -const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] = +const int num_4x4_blocks_high_lookup[BLOCK_SIZES] = {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; // Log 2 conversion lookup tables for modeinfo width and height -const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] = +const int mi_width_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; -const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] = +const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; -const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = +const int mi_height_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; -const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = +const int num_8x8_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; // MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize))) -const int size_group_lookup[BLOCK_SIZE_TYPES] = +const int size_group_lookup[BLOCK_SIZES] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3}; -const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] = +const int num_pels_log2_lookup[BLOCK_SIZES] = {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; -const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { +const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { { // 4X4 // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, @@ -74,51 +74,62 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { } }; -const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { +const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = { { // PARTITION_NONE - BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, }, { // PARTITION_HORZ - BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X4, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_32X16, BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, }, { // PARTITION_VERT - BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_4X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X32, BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, }, { // PARTITION_SPLIT - 
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_4X4, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_16X16, BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, } }; -const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, +const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, TX_16X16, TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32 }; -const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, +const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, TX_16X16, TX_16X16, TX_16X16, TX_32X32 }; -const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = { - { BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 }, - { BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 }, - { BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 }, - { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 }, - { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 } +const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { +// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 +// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + {{BLOCK_4X4, BLOCK_INVALID}, {BLOCK_INVALID, BLOCK_INVALID}}, + {{BLOCK_4X8, BLOCK_4X4}, {BLOCK_INVALID, BLOCK_INVALID}}, + {{BLOCK_8X4, BLOCK_INVALID}, {BLOCK_4X4, BLOCK_INVALID}}, + {{BLOCK_8X8, BLOCK_8X4}, {BLOCK_4X8, BLOCK_4X4}}, + {{BLOCK_8X16, BLOCK_8X8}, {BLOCK_INVALID, BLOCK_4X8}}, + {{BLOCK_16X8, BLOCK_INVALID}, {BLOCK_8X8, BLOCK_8X4}}, + {{BLOCK_16X16, BLOCK_16X8}, {BLOCK_8X16, BLOCK_8X8}}, + {{BLOCK_16X32, BLOCK_16X16}, {BLOCK_INVALID, BLOCK_8X16}}, + {{BLOCK_32X16, BLOCK_INVALID}, {BLOCK_16X16, BLOCK_16X8}}, + {{BLOCK_32X32, BLOCK_32X16}, {BLOCK_16X32, BLOCK_16X16}}, + {{BLOCK_32X64, BLOCK_32X32}, {BLOCK_INVALID, BLOCK_16X32}}, + {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32, BLOCK_32X16}}, + {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, }; + diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index bc8c01a..3822bfc 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -13,20 +13,20 @@ #include "vp9/common/vp9_enums.h" -extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES]; -extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; -extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; -extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; -extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; -extern const int size_group_lookup[BLOCK_SIZE_TYPES]; -extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES]; -extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES]; -extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; -extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; -extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; -extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5]; +extern const int b_width_log2_lookup[BLOCK_SIZES]; +extern const int 
b_height_log2_lookup[BLOCK_SIZES]; +extern const int mi_width_log2_lookup[BLOCK_SIZES]; +extern const int mi_height_log2_lookup[BLOCK_SIZES]; +extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES]; +extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES]; +extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES]; +extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES]; +extern const int size_group_lookup[BLOCK_SIZES]; +extern const int num_pels_log2_lookup[BLOCK_SIZES]; +extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; +extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; +extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; +extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; +extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; #endif // VP9_COMMON_VP9_COMMON_DATA_H diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index 6f1e418..94231a1 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -14,66 +14,45 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#define VP9_FILTER_WEIGHT 128 -#define VP9_FILTER_SHIFT 7 - -/* Assume a bank of 16 filters to choose from. There are two implementations - * for filter wrapping behavior, since we want to be able to pick which filter - * to start with. We could either: - * - * 1) make filter_ a pointer to the base of the filter array, and then add an - * additional offset parameter, to choose the starting filter. - * 2) use a pointer to 2 periods worth of filters, so that even if the original - * phase offset is at 15/16, we'll have valid data to read. The filter - * tables become [32][8], and the second half is duplicated. - * 3) fix the alignment of the filter tables, so that we know the 0/16 is - * always 256 byte aligned. - * - * Implementations 2 and 3 are likely preferable, as they avoid an extra 2 - * parameters, and switching between them is trivial, with the - * ALIGN_FILTERS_256 macro, below. - */ - #define ALIGN_FILTERS_256 1 - static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; - const int16_t *filter_x_base = filter_x0; + int x, y, k; -#if ALIGN_FILTERS_256 - filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. 
*/ + const int16_t *const filter_x_base = + (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source line */ src -= taps / 2 - 1; for (y = 0; y < h; ++y) { - /* Pointer to filter to use */ - const int16_t *filter_x = filter_x0; - /* Initial phase offset */ - int x0_q4 = (filter_x - filter_x_base) / taps; - int x_q4 = x0_q4; + int x_q4 = (filter_x0 - filter_x_base) / taps; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ - int src_x = (x_q4 - x0_q4) >> 4; + const int src_x = x_q4 >> SUBPEL_BITS; + int sum = 0; - for (sum = 0, k = 0; k < taps; ++k) { + /* Pointer to filter to use */ + const int16_t *const filter_x = filter_x_base + + (x_q4 & SUBPEL_MASK) * taps; + + for (k = 0; k < taps; ++k) sum += src[src_x + k] * filter_x[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT); - /* Adjust source and filter to use for the next pixel */ + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + + /* Move to the next source pixel */ x_q4 += x_step_q4; - filter_x = filter_x_base + (x_q4 & 0xf) * taps; } src += src_stride; dst += dst_stride; @@ -85,37 +64,37 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; - const int16_t *filter_x_base = filter_x0; + int x, y, k; -#if ALIGN_FILTERS_256 - filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. */ + const int16_t *const filter_x_base = + (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source line */ src -= taps / 2 - 1; for (y = 0; y < h; ++y) { - /* Pointer to filter to use */ - const int16_t *filter_x = filter_x0; - /* Initial phase offset */ - int x0_q4 = (filter_x - filter_x_base) / taps; - int x_q4 = x0_q4; + int x_q4 = (filter_x0 - filter_x_base) / taps; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ - int src_x = (x_q4 - x0_q4) >> 4; + const int src_x = x_q4 >> SUBPEL_BITS; + int sum = 0; - for (sum = 0, k = 0; k < taps; ++k) { + /* Pointer to filter to use */ + const int16_t *const filter_x = filter_x_base + + (x_q4 & SUBPEL_MASK) * taps; + + for (k = 0; k < taps; ++k) sum += src[src_x + k] * filter_x[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; - /* Adjust source and filter to use for the next pixel */ + dst[x] = ROUND_POWER_OF_TWO(dst[x] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + + /* Move to the next source pixel */ x_q4 += x_step_q4; - filter_x = filter_x_base + (x_q4 & 0xf) * taps; } src += src_stride; dst += dst_stride; @@ -127,37 +106,37 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; + int x, y, k; - const int16_t *filter_y_base = filter_y0; - -#if ALIGN_FILTERS_256 - filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. 
*/ + const int16_t *const filter_y_base = + (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source column */ src -= src_stride * (taps / 2 - 1); - for (x = 0; x < w; ++x) { - /* Pointer to filter to use */ - const int16_t *filter_y = filter_y0; + for (x = 0; x < w; ++x) { /* Initial phase offset */ - int y0_q4 = (filter_y - filter_y_base) / taps; - int y_q4 = y0_q4; + int y_q4 = (filter_y0 - filter_y_base) / taps; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ - int src_y = (y_q4 - y0_q4) >> 4; + const int src_y = y_q4 >> SUBPEL_BITS; + int sum = 0; - for (sum = 0, k = 0; k < taps; ++k) { + /* Pointer to filter to use */ + const int16_t *const filter_y = filter_y_base + + (y_q4 & SUBPEL_MASK) * taps; + + for (k = 0; k < taps; ++k) sum += src[(src_y + k) * src_stride] * filter_y[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT); - /* Adjust source and filter to use for the next pixel */ + dst[y * dst_stride] = + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + + /* Move to the next source pixel */ y_q4 += y_step_q4; - filter_y = filter_y_base + (y_q4 & 0xf) * taps; } ++src; ++dst; @@ -169,38 +148,37 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { - int x, y, k, sum; - - const int16_t *filter_y_base = filter_y0; + int x, y, k; -#if ALIGN_FILTERS_256 - filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); -#endif + /* NOTE: This assumes that the filter table is 256-byte aligned. */ + /* TODO(agrange) Modify to make independent of table alignment. */ + const int16_t *const filter_y_base = + (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); /* Adjust base pointer address for this source column */ src -= src_stride * (taps / 2 - 1); - for (x = 0; x < w; ++x) { - /* Pointer to filter to use */ - const int16_t *filter_y = filter_y0; + for (x = 0; x < w; ++x) { /* Initial phase offset */ - int y0_q4 = (filter_y - filter_y_base) / taps; - int y_q4 = y0_q4; + int y_q4 = (filter_y0 - filter_y_base) / taps; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ - int src_y = (y_q4 - y0_q4) >> 4; + const int src_y = y_q4 >> SUBPEL_BITS; + int sum = 0; + + /* Pointer to filter to use */ + const int16_t *const filter_y = filter_y_base + + (y_q4 & SUBPEL_MASK) * taps; - for (sum = 0, k = 0; k < taps; ++k) { + for (k = 0; k < taps; ++k) sum += src[(src_y + k) * src_stride] * filter_y[k]; - } - sum += (VP9_FILTER_WEIGHT >> 1); - dst[y * dst_stride] = - (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; - /* Adjust source and filter to use for the next pixel */ + dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + + /* Move to the next source pixel */ y_q4 += y_step_q4; - filter_y = filter_y_base + (y_q4 & 0xf) * taps; } ++src; ++dst; @@ -213,58 +191,27 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 135, for y_step_q4 == 32, + * Maximum intermediate_height is 324, for y_step_q4 == 80, * h == 64, taps == 8. 
+ * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc */ - uint8_t temp[64 * 135]; - int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; + uint8_t temp[64 * 324]; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps; assert(w <= 64); assert(h <= 64); assert(taps <= 8); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 80); + assert(x_step_q4 <= 80); if (intermediate_height < h) intermediate_height = h; - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height, taps); - convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, taps); -} - -static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 135, for y_step_q4 == 32, - * h == 64, taps == 8. - */ - uint8_t temp[64 * 135]; - int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; - - assert(w <= 64); - assert(h <= 64); - assert(taps <= 8); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (intermediate_height < h) - intermediate_height = h; - - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, - temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, intermediate_height, taps); - convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, taps); + convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, + intermediate_height, taps); + convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, taps); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -273,8 +220,7 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -283,8 +229,7 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -293,8 +238,7 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -303,8 +247,7 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_c(const uint8_t *src, ptrdiff_t 
src_stride, @@ -313,8 +256,7 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { convolve_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, @@ -327,16 +269,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_convolve8(src, src_stride, - temp, 64, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); - vp9_convolve_avg(temp, 64, - dst, dst_stride, - NULL, 0, /* These unused parameter should be removed! */ - NULL, 0, /* These unused parameter should be removed! */ - w, h); + vp9_convolve8(src, src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); } void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, @@ -361,9 +296,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, int x, y; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = (dst[x] + src[x] + 1) >> 1; - } + for (x = 0; x < w; ++x) + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; dst += dst_stride; } diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h index 3de8111..13220e9 100644 --- a/libvpx/vp9/common/vp9_convolve.h +++ b/libvpx/vp9/common/vp9_convolve.h @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#define FILTER_BITS 7 + typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 370ebe8..79f769e 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -22,23 +22,24 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. 
*/ -static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor, +static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, size_t member_offset) { int mi_row; int mi_col; int mi_index = 0; - MODE_INFO *mi = common->mi; - int rows = common->mi_rows; - int cols = common->mi_cols; + MODE_INFO **mi_8x8 = cm->mi_grid_visible; + int rows = cm->mi_rows; + int cols = cm->mi_cols; char prefix = descriptor[0]; - log_frame_info(common, descriptor, file); + log_frame_info(cm, descriptor, file); mi_index = 0; for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", - *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset))); + *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) + + member_offset))); mi_index++; } fprintf(file, "\n"); @@ -51,23 +52,23 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { int mi_col; int mi_index = 0; FILE *mvs = fopen(file, "a"); - MODE_INFO *mi = cm->mi; + MODE_INFO **mi_8x8 = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); - print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size)); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); log_frame_info(cm, "Vectors ",mvs); for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs,"V "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row, - mi[mi_index].mbmi.mv[0].as_mv.col); + fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, + mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); mi_index++; } fprintf(mvs, "\n"); diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index df3a9fe..32d9e0c 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -377,7 +377,7 @@ static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { static void extend_model_to_full_distribution(vp9_prob p, vp9_prob *tree_probs) { - const int l = ((p - 1) / 2); + const int l = (p - 1) / 2; const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; if (p & 1) { vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, @@ -436,11 +436,11 @@ const vp9_extra_bit vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" -void vp9_default_coef_probs(VP9_COMMON *pc) { - vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4); - vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8); - vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16); - vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32); +void vp9_default_coef_probs(VP9_COMMON *cm) { + vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4); + vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } // Neighborhood 5-tuples for various scans and blocksizes, @@ -622,7 +622,6 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, int t, i, j, k, l; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; 
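/* A sketch of the adaptation performed below (an illustrative note, assuming
 * merge_probs() is the usual count-saturated blend; this is not text from the
 * original source): each adapted probability is a count-weighted mix of the
 * previous frame's probability and the one implied by this frame's counts:
 *
 *   factor = update_factor * MIN(count, count_sat) / count_sat;
 *   merged = ROUND_POWER_OF_TWO(pre_prob * (256 - factor)
 *                               + new_prob * factor, 8);
 *
 * so a branch seen count_sat or more times moves update_factor/256 of the
 * way toward this frame's estimate, while an unseen branch keeps the
 * previous frame's probability. */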
vp9_prob coef_probs[UNCONSTRAINED_NODES]; - int entropy_nodes_adapt = UNCONSTRAINED_NODES; for (i = 0; i < BLOCK_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) @@ -635,7 +634,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < entropy_nodes_adapt; ++t) + for (t = 0; t < UNCONSTRAINED_NODES; ++t) dst_coef_probs[i][j][k][l][t] = merge_probs( pre_coef_probs[i][j][k][l][t], coef_probs[t], branch_ct[t], count_sat, update_factor); diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 861c078..f138c09 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -95,7 +95,7 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] #define MODULUS_PARAM 13 /* Modulus parameter */ struct VP9Common; -void vp9_default_coef_probs(struct VP9Common *); +void vp9_default_coef_probs(struct VP9Common *cm); extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); @@ -154,19 +154,17 @@ extern DECLARE_ALIGNED(16, int16_t, vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); void vp9_coef_tree_initialize(void); -void vp9_adapt_coef_probs(struct VP9Common *); +void vp9_adapt_coef_probs(struct VP9Common *cm); -static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd, - BLOCK_SIZE_TYPE bsize) { - /* Clear entropy contexts */ - const int bw = 1 << b_width_log2(bsize); - const int bh = 1 << b_height_log2(bsize); +static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { int i; for (i = 0; i < MAX_MB_PLANE; i++) { - vpx_memset(xd->plane[i].above_context, 0, - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x); - vpx_memset(xd->plane[i].left_context, 0, - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y); + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * + num_4x4_blocks_wide_lookup[plane_bsize]); + vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * + num_4x4_blocks_high_lookup[plane_bsize]); } } @@ -338,6 +336,45 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { } } +static int get_entropy_context(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx, + ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, + const int16_t **scan, + const uint8_t **band_translate) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + *scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); + *band_translate = vp9_coefband_trans_4x4; + above_ec = A[0] != 0; + left_ec = L[0] != 0; + break; + case TX_8X8: + *scan = get_scan_8x8(get_tx_type_8x8(type, xd)); + *band_translate = vp9_coefband_trans_8x8plus; + above_ec = !!*(uint16_t *)A; + left_ec = !!*(uint16_t *)L; + break; + case TX_16X16: + *scan = get_scan_16x16(get_tx_type_16x16(type, xd)); + *band_translate = vp9_coefband_trans_8x8plus; + above_ec = !!*(uint32_t *)A; + left_ec = !!*(uint32_t *)L; + break; + case TX_32X32: + *scan = vp9_default_scan_32x32; + *band_translate = vp9_coefband_trans_8x8plus; + above_ec = !!*(uint64_t *)A; + left_ec = !!*(uint64_t *)L; + break; + default: + assert(!"Invalid transform size."); + } + + return combine_entropy_contexts(above_ec, left_ec); +} + enum { VP9_COEF_UPDATE_PROB = 252 }; #endif 
// VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 768e5f5..93c89b0 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -14,8 +14,8 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_seg_common.h" -const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES] + [INTRA_MODES - 1] = { { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, @@ -23,21 +23,21 @@ const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES] { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */, { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */, { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */, - { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d27 */, + { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d207 */, { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */, { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */ }; static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS] - [VP9_INTRA_MODES - 1] = { + [INTRA_MODES - 1] = { { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */, { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */, { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 32x32 */, { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */ }; -static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +static const vp9_prob default_if_uv_probs[INTRA_MODES] + [INTRA_MODES - 1] = { { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */, { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */, { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */, @@ -45,7 +45,7 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */, { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */, { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */, - { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d27 */, + { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d207 */, { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */, { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ }; @@ -98,9 +98,9 @@ static const vp9_prob default_partition_probs[NUM_FRAME_TYPES] } }; -const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES] + [INTRA_MODES] + [INTRA_MODES - 1] = { { /* above = dc */ { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, @@ -109,7 +109,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, - { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, + { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d207 */, { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ }, { /* above = v */ @@ -120,7 +120,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, - { 52, 30, 
74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, + { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d207 */, { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ }, { /* above = h */ @@ -131,7 +131,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, - { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, + { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d207 */, { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ }, { /* above = d45 */ @@ -142,7 +142,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, - { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, + { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d207 */, { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ }, { /* above = d135 */ @@ -153,7 +153,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, - { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, + { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d207 */, { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ }, { /* above = d117 */ @@ -164,7 +164,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, - { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, + { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d207 */, { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ }, { /* above = d153 */ @@ -175,10 +175,10 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, - { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, + { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d207 */, { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ - }, { /* above = d27 */ + }, { /* above = d207 */ { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, @@ -186,7 +186,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, - { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, + { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d207 */, { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* 
left = tm */ }, { /* above = d63 */ @@ -197,7 +197,7 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, - { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, + { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d207 */, { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ }, { /* above = tm */ @@ -208,14 +208,14 @@ const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, - { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, + { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d207 */, { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ } }; static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] - [VP9_INTER_MODES - 1] = { + [INTER_MODES - 1] = { {2, 173, 34}, // 0 = both zero mv {7, 145, 85}, // 1 = one zero mv + one a predicted mv {7, 166, 63}, // 2 = two predicted mvs @@ -226,7 +226,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] }; /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ -const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { +const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = { -DC_PRED, 2, /* 0 = DC_NODE */ -TM_PRED, 4, /* 1 = TM_NODE */ -V_PRED, 6, /* 2 = V_NODE */ @@ -235,7 +235,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { -D135_PRED, -D117_PRED, /* 5 = D135_NODE */ -D45_PRED, 14, /* 6 = D45_NODE */ -D63_PRED, 16, /* 7 = D63_NODE */ - -D153_PRED, -D27_PRED /* 8 = D153_NODE */ + -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; const vp9_tree_index vp9_inter_mode_tree[6] = { @@ -250,8 +250,8 @@ const vp9_tree_index vp9_partition_tree[6] = { -PARTITION_VERT, -PARTITION_SPLIT }; -struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; -struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; +struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; +struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; @@ -317,8 +317,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS-1] = { +static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1] + [SWITCHABLE_FILTERS-1] = { { 235, 162, }, { 36, 255, }, { 34, 3, }, @@ -338,14 +338,11 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); } -const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { - -0, 2, - -1, -2 +const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = { + -EIGHTTAP, 2, + -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; -const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { - EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP}; -const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, 0, 2, -1, -1}; +struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; void vp9_entropy_mode_init() { 
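  /* Sketch of the tree convention relied on above (illustrative, not part of
   * the original source): vp9_tree_index entries come in pairs, one pair per
   * internal node.  A negative entry -v is a leaf coding symbol v; a
   * non-negative entry is the index of the pair to descend into.  The
   * switchable interp tree above therefore reads: bit 0 -> EIGHTTAP,
   * bits 1,0 -> EIGHTTAP_SMOOTH, bits 1,1 -> EIGHTTAP_SHARP.  The
   * vp9_tokens_from_tree() calls below flatten such trees into
   * (value, length) codes. */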
vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); @@ -403,17 +400,17 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { counts->single_ref[i][j]); for (i = 0; i < INTER_MODE_CONTEXTS; i++) - update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree, + update_mode_probs(INTER_MODES, vp9_inter_mode_tree, counts->inter_mode[i], pre_fc->inter_mode_probs[i], fc->inter_mode_probs[i], NEARESTMV); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) - update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, + update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, counts->y_mode[i], pre_fc->y_mode_prob[i], fc->y_mode_prob[i], 0); - for (i = 0; i < VP9_INTRA_MODES; ++i) - update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, + for (i = 0; i < INTRA_MODES; ++i) + update_mode_probs(INTRA_MODES, vp9_intra_mode_tree, counts->uv_mode[i], pre_fc->uv_mode_prob[i], fc->uv_mode_prob[i], 0); @@ -424,8 +421,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { fc->partition_prob[INTER_FRAME][i], 0); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) - update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree, + for (i = 0; i <= SWITCHABLE_FILTERS; i++) + update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree, counts->switchable_interp[i], pre_fc->switchable_interp_prob[i], fc->switchable_interp_prob[i], 0); @@ -443,14 +440,12 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); - tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], - branch_ct_16x16p); + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); for (j = 0; j < TX_SIZES - 2; ++j) fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); - tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], - branch_ct_32x32p); + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); for (j = 0; j < TX_SIZES - 1; ++j) fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); @@ -475,14 +470,14 @@ static void set_default_lf_deltas(struct loopfilter *lf) { lf->mode_deltas[1] = 0; } -void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { +void vp9_setup_past_independence(VP9_COMMON *cm) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). 
- struct loopfilter *const lf = &xd->lf; + struct loopfilter *const lf = &cm->lf; int i; - vp9_clearall_segfeatures(&xd->seg); - xd->seg.abs_delta = SEGMENT_DELTADATA; + vp9_clearall_segfeatures(&cm->seg); + cm->seg.abs_delta = SEGMENT_DELTADATA; if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); @@ -515,10 +510,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_in_image(cm, cm->mi); - vp9_update_mode_info_border(cm, cm->prev_mip); - vp9_update_mode_info_in_image(cm, cm->prev_mi); vp9_zero(cm->ref_frame_sign_bias); diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 17a7c26..4cf4c03 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -16,8 +16,8 @@ #define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 -#define VP9_MODE_UPDATE_PROB 252 -#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters +#define MODE_UPDATE_PROB 252 +#define SWITCHABLE_FILTERS 3 // number of switchable filters // #define MODE_STATS @@ -35,37 +35,32 @@ struct tx_counts { unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; -extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; -extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] + [INTRA_MODES - 1]; extern const vp9_tree_index vp9_intra_mode_tree[]; extern const vp9_tree_index vp9_inter_mode_tree[]; -extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; -extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; +extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; +extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; // probability models for partition information extern const vp9_tree_index vp9_partition_tree[]; extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; -extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp - [VP9_SWITCHABLE_FILTERS]; - -extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; - extern const vp9_tree_index vp9_switchable_interp_tree - [2 * (VP9_SWITCHABLE_FILTERS - 1)]; + [2 * (SWITCHABLE_FILTERS - 1)]; -extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; void vp9_entropy_mode_init(); -void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); +void vp9_setup_past_independence(struct VP9Common *cm); -void vp9_init_mbmode_probs(struct VP9Common *x); +void vp9_init_mbmode_probs(struct VP9Common *cm); -void vp9_adapt_mode_probs(struct VP9Common *); +void vp9_adapt_mode_probs(struct VP9Common *cm); void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]); diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index 6cfc346..2e973e5 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -79,20 +79,59 @@ static const nmv_context default_nmv_context = { #define mv_class_base(c) ((c) ? 
(CLASS0_SIZE << (c + 2)) : 0) +static const uint8_t log_in_base_2[] = { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 +}; + MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { MV_CLASS_TYPE c = MV_CLASS_0; - if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; - else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; - else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; - else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3; - else if (z < CLASS0_SIZE * 128) c = 
MV_CLASS_4; - else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; - else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; - else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; - else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; - else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; - else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10; - else assert(0); + if (z >= CLASS0_SIZE * 4096) + c = MV_CLASS_10; + else + c = log_in_base_2[z >> 3]; + if (offset) *offset = z - mv_class_base(c); return c; @@ -110,8 +149,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, int usehp) { int s, z, c, o, d, e, f; - if (!incr) - return; assert (v != 0); /* should not be zero */ s = v < 0; comp_counts->sign[s] += incr; @@ -123,61 +160,39 @@ static void inc_mv_component(int v, nmv_component_counts *comp_counts, d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { comp_counts->class0[d] += incr; + comp_counts->class0_fp[d][f] += incr; + comp_counts->class0_hp[e] += usehp * incr; } else { int i; int b = c + CLASS0_BITS - 1; // number of bits for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr; - } - - /* Code the fractional pel bits */ - if (c == MV_CLASS_0) { - comp_counts->class0_fp[d][f] += incr; - } else { comp_counts->fp[f] += incr; - } - - /* Code the high precision bit */ - if (usehp) { - if (c == MV_CLASS_0) { - comp_counts->class0_hp[e] += incr; - } else { - comp_counts->hp[e] += incr; - } + comp_counts->hp[e] += usehp * incr; } } -static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { - int v; - vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); - for (v = 1; v <= MV_MAX; v++) { - inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); - inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); - } -} void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); ++counts->joints[j]; - if (mv_joint_vertical(j)) - ++counts->comps[0].mvcount[MV_MAX + mv->row]; + if (mv_joint_vertical(j)) { + inc_mv_component(mv->row, &counts->comps[0], 1, 1); + } - if (mv_joint_horizontal(j)) - ++counts->comps[1].mvcount[MV_MAX + mv->col]; + if (mv_joint_horizontal(j)) { + inc_mv_component(mv->col, &counts->comps[1], 1, 1); + } } static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } -void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { - counts_to_context(&nmv_count->comps[0], usehp); - counts_to_context(&nmv_count->comps[1], usehp); -} - static unsigned int adapt_probs(unsigned int i, vp9_tree tree, vp9_prob this_probs[], @@ -207,8 +222,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { nmv_context *pre_ctx = &pre_fc->nmvc; nmv_context_counts *cts = &cm->counts.mv; - vp9_counts_process(cts, allow_hp); - adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); for (i = 0; i < 2; ++i) { diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index 85a1f3a..a10c933 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -24,7 +24,7 @@ void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); int vp9_use_mv_hp(const MV *ref); -#define VP9_NMV_UPDATE_PROB 252 +#define NMV_UPDATE_PROB 252 /* Symbols for coding 
which components are zero jointly */ #define MV_JOINTS 4 @@ -126,6 +126,4 @@ typedef struct { void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); -void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); - #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 3208b72..1bf0742 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -13,37 +13,40 @@ #include "./vpx_config.h" -#define LOG2_MI_SIZE 3 -#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6 +#define MI_SIZE_LOG2 3 +#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 -#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit -#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block +#define MI_SIZE (1 << MI_SIZE_LOG2) // pixels per mi-unit +#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2) // mi-units per max block #define MI_MASK (MI_BLOCK_SIZE - 1) -typedef enum BLOCK_SIZE_TYPE { - BLOCK_SIZE_AB4X4, BLOCK_4X4 = BLOCK_SIZE_AB4X4, - BLOCK_SIZE_SB4X8, BLOCK_4X8 = BLOCK_SIZE_SB4X8, - BLOCK_SIZE_SB8X4, BLOCK_8X4 = BLOCK_SIZE_SB8X4, - BLOCK_SIZE_SB8X8, BLOCK_8X8 = BLOCK_SIZE_SB8X8, - BLOCK_SIZE_SB8X16, BLOCK_8X16 = BLOCK_SIZE_SB8X16, - BLOCK_SIZE_SB16X8, BLOCK_16X8 = BLOCK_SIZE_SB16X8, - BLOCK_SIZE_MB16X16, BLOCK_16X16 = BLOCK_SIZE_MB16X16, - BLOCK_SIZE_SB16X32, BLOCK_16X32 = BLOCK_SIZE_SB16X32, - BLOCK_SIZE_SB32X16, BLOCK_32X16 = BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, BLOCK_32X32 = BLOCK_SIZE_SB32X32, - BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64, - BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64, - BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES -} BLOCK_SIZE_TYPE; + +typedef enum BLOCK_SIZE { + BLOCK_4X4, + BLOCK_4X8, + BLOCK_8X4, + BLOCK_8X8, + BLOCK_8X16, + BLOCK_16X8, + BLOCK_16X16, + BLOCK_16X32, + BLOCK_32X16, + BLOCK_32X32, + BLOCK_32X64, + BLOCK_64X32, + BLOCK_64X64, + BLOCK_SIZES, + BLOCK_INVALID = BLOCK_SIZES +} BLOCK_SIZE; typedef enum PARTITION_TYPE { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, - PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES + PARTITION_TYPES, + PARTITION_INVALID = PARTITION_TYPES } PARTITION_TYPE; #define PARTITION_PLOFFSET 4 // number of probability models per block size diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c index d8496c4..07c68c8 100644 --- a/libvpx/vp9/common/vp9_extend.c +++ b/libvpx/vp9/common/vp9_extend.c @@ -57,15 +57,23 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch, void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst) { - const int et_y = dst->border; - const int el_y = dst->border; - const int eb_y = dst->border + dst->y_height - src->y_height; - const int er_y = dst->border + dst->y_width - src->y_width; - - const int et_uv = dst->border >> (dst->uv_height != dst->y_height); - const int el_uv = dst->border >> (dst->uv_width != dst->y_width); - const int eb_uv = et_uv + dst->uv_height - src->uv_height; - const int er_uv = el_uv + dst->uv_width - src->uv_width; + // Extend src frame in buffer + // Altref filtering assumes 16 pixel extension + const int et_y = 16; + const int el_y = 16; + // Motion estimation may use src block variance with the block size up + // to 64x64, so the right and bottom need to be extended to a multiple + // of 64, or up to 16, whichever is greater.
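  // Worked numbers for the extension below (illustrative, not from the
  // source): for a 1280x720 frame, ALIGN_POWER_OF_TWO(1280, 6) - 1280 = 0
  // and ALIGN_POWER_OF_TWO(720, 6) - 720 = 48, so the two borders are
  // extended by MAX(0, 16) = 16 and MAX(48, 16) = 48 pixels respectively.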
+ const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width, + 16); + const int er_y = MAX(ALIGN_POWER_OF_TWO(src->y_height, 6) - src->y_height, + 16); + const int uv_width_subsampling = (src->uv_width != src->y_width); + const int uv_height_subsampling = (src->uv_height != src->y_height); + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >> uv_width_subsampling; #if CONFIG_ALPHA const int et_a = dst->border >> (dst->alpha_height != dst->y_height); diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c index e5503cd..4ac2bc9 100644 --- a/libvpx/vp9/common/vp9_filter.c +++ b/libvpx/vp9/common/vp9_filter.c @@ -8,14 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vpx_ports/mem.h" -#include <stdlib.h> #include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" -#include "vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { +DECLARE_ALIGNED(256, const int16_t, + vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -34,8 +32,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { { 0, 0, 0, 8, 120, 0, 0, 0 } }; -DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { - /* Lagrangian interpolation filter */ +// Lagrangian interpolation filter +DECLARE_ALIGNED(256, const int16_t, + vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, { -1, 3, -10, 122, 18, -6, 2, 0}, @@ -54,9 +53,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { { 0, 1, -3, 8, 126, -5, 1, 0} }; -DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) - = { - /* dct based filter */ +// DCT based filter +DECLARE_ALIGNED(256, const int16_t, + vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, {-2, 5, -13, 125, 17, -6, 3, -1}, @@ -75,9 +74,9 @@ DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) {0, 1, -3, 8, 127, -7, 3, -1} }; +// freqmultiplier = 0.5 DECLARE_ALIGNED(256, const int16_t, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = { - /* freqmultiplier = 0.5 */ + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, {-2, -2, 29, 63, 41, 2, -3, 0}, diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h index 1ccfdaa..7b1ffae 100644 --- a/libvpx/vp9/common/vp9_filter.h +++ b/libvpx/vp9/common/vp9_filter.h @@ -12,26 +12,22 @@ #define VP9_COMMON_VP9_FILTER_H_ #include "vpx_config.h" -#include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" -#define BLOCK_HEIGHT_WIDTH 4 -#define VP9_FILTER_WEIGHT 128 -#define VP9_FILTER_SHIFT 7 +#define SUBPEL_BITS 4 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define SUBPEL_SHIFTS (1 << SUBPEL_BITS) +#define SUBPEL_TAPS 8 -#define SUBPEL_SHIFTS 16 - -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; -extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; +extern const int16_t 
vp9_bilinear_filters[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][SUBPEL_TAPS]; +extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][SUBPEL_TAPS]; // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. -#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \ - sizeof(vp9_bilinear_filters[0][0])) -#define BF_OFFSET (BF_LENGTH / 2 - 1) -#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET) +#define BILINEAR_FILTERS_2TAP(x) \ + (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index 3af8b8d..49a731f 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -8,19 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <limits.h> - #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_sadmxn.h" -static void lower_mv_precision(int_mv *mv, int allow_hp) { - const int use_hp = allow_hp && vp9_use_mv_hp(&mv->as_mv); +static void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(mv); if (!use_hp) { - if (mv->as_mv.row & 1) - mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); - if (mv->as_mv.col & 1) - mv->as_mv.col += (mv->as_mv.col > 0 ? -1 : 1); + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? -1 : 1); } } @@ -32,7 +29,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int i; // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); + lower_mv_precision(&mvlist[i].as_mv, xd->allow_high_precision_mv); clamp_mv2(&mvlist[i].as_mv, xd); } *nearest = mvlist[0]; @@ -46,17 +43,14 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { int_mv dst_list[MAX_MV_REF_CANDIDATES]; int_mv mv_list[MAX_MV_REF_CANDIDATES]; - MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MODE_INFO *const mi = xd->this_mi; assert(ref_idx == 0 || ref_idx == 1); assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context, - xd->prev_mode_info_context, - mbmi->ref_frame[ref_idx], - mv_list, cm->ref_frame_sign_bias, block_idx, - mi_row, mi_col); + vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi, + mi->mbmi.ref_frame[ref_idx], + mv_list, block_idx, mi_row, mi_col); dst_list[1].as_int = 0; if (block_idx == 0) { diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index e5221ed..ad0d882 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -36,48 +36,57 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int_mv *dst_nearest, int_mv *dst_near, int block_idx, int ref_idx, int mi_row, int mi_col); -static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { +static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, + const MODE_INFO *left_mb, int b) { // 
FIXME(rbultje, jingning): temporary hack because jenkins doesn't // understand this condition. This will go away soon. + const MODE_INFO *mi = cur_mb; + if (b == 0 || b == 2) { /* On L edge, get from MB to left of us */ - --cur_mb; + mi = left_mb; + if (!mi) + return DC_PRED; - if (is_inter_block(&cur_mb->mbmi)) { + if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; - } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 1 + b)->as_mode); + } else if (mi->mbmi.sb_type < BLOCK_8X8) { + return ((mi->bmi + 1 + b)->as_mode); } else { - return cur_mb->mbmi.mode; + return mi->mbmi.mode; } } assert(b == 1 || b == 3); - return (cur_mb->bmi + b - 1)->as_mode; + return (mi->bmi + b - 1)->as_mode; } static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, - int b, int mi_stride) { + const MODE_INFO *above_mb, int b) { + const MODE_INFO *mi = cur_mb; + if (!(b >> 1)) { /* On top edge, get from MB above us */ - cur_mb -= mi_stride; + mi = above_mb; + if (!mi) + return DC_PRED; - if (is_inter_block(&cur_mb->mbmi)) { + if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; - } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 2 + b)->as_mode); + } else if (mi->mbmi.sb_type < BLOCK_8X8) { + return ((mi->bmi + 2 + b)->as_mode); } else { - return cur_mb->mbmi.mode; + return mi->mbmi.mode; } } - return (cur_mb->bmi + b - 2)->as_mode; + return (mi->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 2d959f0..0c47da6 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -27,6 +27,9 @@ #define pair_set_epi16(a, b) \ _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32(b, a, b, a) + // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 66df627..cfb5cd4 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -22,13 +22,217 @@ struct loop_filter_info { const uint8_t *hev_thr; }; +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; +} LOOP_FILTER_MASK; + +// 64 bit masks for left transform size. Each 1 represents a position where +// we should apply a loop filter across the left border of an 8x8 block +// boundary. +// +// In the case of TX_16X16-> ( in low order byte first we end up with +// a mask that looks like this +// +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// 10101010 +// +// A loopfilter should be applied to every other 8x8 horizontally. 
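To make the layout concrete, here is a small standalone sketch (not part of this change) that prints a 64-bit mask as the 8x8 grid of 8x8 blocks it encodes, low-order byte first; fed the TX_16X16 entry of the left-mask table that follows, it reproduces the alternating-column figure above.

#include <stdio.h>
#include <stdint.h>

/* Print a 64-bit loop-filter mask as an 8x8 grid. Bit (r * 8 + c) is the
 * 8x8 block in row r, column c of the 64x64 superblock, so the low-order
 * byte is the top row of the figure. */
static void print_mask(uint64_t mask) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c)
      putchar(((mask >> (r * 8 + c)) & 1) ? '1' : '0');
    putchar('\n');
  }
}

int main(void) {
  print_mask(0x5555555555555555ULL);  /* TX_16X16 left mask: 10101010 rows */
  return 0;
}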
+static const uint64_t left_64x64_txform_mask[TX_SIZES] = { + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x5555555555555555, // TX_16x16 + 0x1111111111111111, // TX_32x32 +}; + +// 64 bit masks for above transform size. Each 1 represents a position where +// we should apply a loop filter across the top border of an 8x8 block +// boundary. +// +// In the case of TX_32X32 (low order byte first) we end up with +// a mask that looks like this: +// +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// 11111111 +// 00000000 +// 00000000 +// 00000000 +// +// A loopfilter should be applied to every 4th row vertically. +static const uint64_t above_64x64_txform_mask[TX_SIZES] = { + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x00ff00ff00ff00ff, // TX_16x16 + 0x000000ff000000ff, // TX_32x32 +}; + +// 64 bit masks for prediction sizes (left). Each 1 represents a position +// on the left border of an 8x8 block. These are aligned to the rightmost +// appropriate bit, and then shifted into place. +// +// In the case of BLOCK_16X32 (low order byte first) we end up with +// a mask that looks like this: +// +// 10000000 +// 10000000 +// 10000000 +// 10000000 +// 00000000 +// 00000000 +// 00000000 +// 00000000 +static const uint64_t left_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001, // BLOCK_4X4, + 0x0000000000000001, // BLOCK_4X8, + 0x0000000000000001, // BLOCK_8X4, + 0x0000000000000001, // BLOCK_8X8, + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000001, // BLOCK_16X8, + 0x0000000000000101, // BLOCK_16X16, + 0x0000000001010101, // BLOCK_16X32, + 0x0000000000000101, // BLOCK_32X16, + 0x0000000001010101, // BLOCK_32X32, + 0x0101010101010101, // BLOCK_32X64, + 0x0000000001010101, // BLOCK_64X32, + 0x0101010101010101, // BLOCK_64X64 +}; + +// 64 bit above masks to shift and set for each prediction size. +static const uint64_t above_prediction_mask[BLOCK_SIZES] = { + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000001, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000003, // BLOCK_16X16 + 0x0000000000000003, // BLOCK_16X32, + 0x000000000000000f, // BLOCK_32X16, + 0x000000000000000f, // BLOCK_32X32, + 0x000000000000000f, // BLOCK_32X64, + 0x00000000000000ff, // BLOCK_64X32, + 0x00000000000000ff, // BLOCK_64X64 +}; +// 64 bit mask to shift and set for each prediction size. A bit is set for +// each 8x8 block that the given block size would cover when placed at the +// top left of the 64x64 block. +static const uint64_t size_mask[BLOCK_SIZES] = { + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000303, // BLOCK_16X16 + 0x0000000003030303, // BLOCK_16X32, + 0x0000000000000f0f, // BLOCK_32X16, + 0x000000000f0f0f0f, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, + 0x00000000ffffffff, // BLOCK_64X32, + 0xffffffffffffffff, // BLOCK_64X64 +}; + +// These are used for masking the left and above borders. +static const uint64_t left_border = 0x1111111111111111; +static const uint64_t above_border = 0x000000ff000000ff; + +// 16 bit masks for uv transform sizes.
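With 4:2:0 subsampling, the 64x64 luma superblock covers a 32x32 chroma area, i.e. a 4x4 grid of 8x8 chroma blocks, which is why the uv tables that follow fit in 16 bits. A hypothetical helper (mine, not the change's) mapping a luma grid position to its uv bit index:

#include <stdio.h>

/* Map an 8x8 luma block position (row, col in 0..7) inside the 64x64
 * superblock to the bit index of the covering chroma block in a 16-bit
 * uv mask (a 4x4 grid), assuming 4:2:0 subsampling. */
static int uv_shift(int row, int col) {
  return (row >> 1) * 4 + (col >> 1);
}

int main(void) {
  /* The four 32x32 quadrants land on uv bits 0, 2, 8, 10, matching the
   * shift_32_uv table used by setup_mask further below. */
  printf("%d %d %d %d\n", uv_shift(0, 0), uv_shift(0, 4),
         uv_shift(4, 0), uv_shift(4, 4));
  return 0;
}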
+static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 +}; + +static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= { + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 +}; + +// 16 bit left mask to shift and set for each uv prediction size. +static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 +}; +// 16 bit above mask to shift and set for uv each prediction size. +static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 +}; + +// 64 bit mask to shift and set for each uv prediction size +static const uint16_t size_mask_uv[BLOCK_SIZES] = { + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 +}; +static const uint16_t left_border_uv = 0x1111; +static const uint16_t above_border_uv = 0x000f; + + static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[DC_PRED] = 0; lfi->mode_lf_lut[D45_PRED] = 0; lfi->mode_lf_lut[D135_PRED] = 0; lfi->mode_lf_lut[D117_PRED] = 0; lfi->mode_lf_lut[D153_PRED] = 0; - lfi->mode_lf_lut[D27_PRED] = 0; + lfi->mode_lf_lut[D207_PRED] = 0; lfi->mode_lf_lut[D63_PRED] = 0; lfi->mode_lf_lut[V_PRED] = 0; lfi->mode_lf_lut[H_PRED] = 0; @@ -39,7 +243,7 @@ static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[NEWMV] = 1; } -static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) { +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; // For each possible value for the loop filter fill out limits @@ -61,8 +265,9 @@ static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) { } } -void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { +void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; int i; // init limits for given sharpness @@ -77,16 +282,15 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } -void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, - int default_filt_lvl) { +void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int seg_id; // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 const int n_shift = default_filt_lvl >> 5; loop_filter_info_n *const lfi = &cm->lf_info; - struct loopfilter *const lf = &xd->lf; - struct segmentation *const seg = &xd->seg; + struct loopfilter *const lf = &cm->lf; + struct segmentation *const seg = &cm->seg; // update 
limits if sharpness has changed if (lf->last_sharpness_level != lf->sharpness_level) { @@ -98,7 +302,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment - if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) { + if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); lvl_seg = seg->abs_delta == SEGMENT_ABSDATA ? data @@ -108,7 +312,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas - vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4); + vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id])); continue; } @@ -124,9 +328,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, } } -static int build_lfi(const loop_filter_info_n *const lfi_n, - const MB_MODE_INFO *const mbmi, - struct loop_filter_info *const lfi) { +static int build_lfi(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi, + struct loop_filter_info *lfi) { const int seg = mbmi->segment_id; const int ref = mbmi->ref_frame[0]; const int mode = lfi_n->mode_lf_lut[mbmi->mode]; @@ -236,10 +440,360 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } } -static void filter_block_plane(VP9_COMMON *const cm, - struct macroblockd_plane *const plane, - const MODE_INFO *mi, - int mi_row, int mi_col) { +// This function ors into the current lfm structure, where to do loop +// filters for the specific mi we are looking at. It uses information +// including the block_size_type (32x16, 32x32, etc), the transform size, +// whether there were any coefficients encoded, and the loop filter strength +// block we are currently looking at. Shift is used to position the +// 1's we produce. +// TODO(JBB) Need another function for different resolution color.. +static void build_masks(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + const int shift_uv, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->mbmi.sb_type; + const TX_SIZE tx_size_y = mi->mbmi.tx_size; + const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); + const int skip = mi->mbmi.skip_coeff; + const int seg = mi->mbmi.segment_id; + const int ref = mi->mbmi.ref_frame[0]; + const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode]; + uint64_t *left_y = &lfm->left_y[tx_size_y]; + uint64_t *above_y = &lfm->above_y[tx_size_y]; + uint64_t *int_4x4_y = &lfm->int_4x4_y; + uint16_t *left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *int_4x4_uv = &lfm->int_4x4_uv; + + // If filter level is 0 we don't loop filter. + if (!filter_level) + return; + + // These set 1 in the current block size for the block size edges. + // For instance if the block size is 32x16, we'll set : + // above = 1111 + // 0000 + // and + // left = 1000 + // = 1000 + // NOTE : In this example the low bit is left most ( 1000 ) is stored as + // 1, not 8... + // + // U and v set things on a 16 bit scale. 
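As a check on the 32x16 example just described (a standalone sketch against the table values above, not code from the change): above_prediction_mask[BLOCK_32X16] is 0xf, the four top-edge blocks of row 0, and left_prediction_mask[BLOCK_32X16] is 0x0101, the left-edge block of rows 0 and 1.

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Table values for BLOCK_32X16: 4 blocks wide, 2 blocks tall. */
  const uint64_t above_32x16 = 0x000000000000000f;  /* row 0, cols 0..3 */
  const uint64_t left_32x16  = 0x0000000000000101;  /* col 0, rows 0..1 */
  int c, r;
  for (c = 0; c < 4; ++c)
    assert((above_32x16 >> c) & 1);       /* whole top edge marked */
  for (r = 0; r < 2; ++r)
    assert((left_32x16 >> (r * 8)) & 1);  /* whole left edge marked */
  return 0;
}

Shifting either value by shift_y = row * 8 + col then positions the pattern anywhere in the 64x64 grid, which is what the ors below do.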
+ // + *above_y |= above_prediction_mask[block_size] << shift_y; + *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; + *left_y |= left_prediction_mask[block_size] << shift_y; + *left_uv |= left_prediction_mask_uv[block_size] << shift_uv; + + // If the block has no coefficients and is not intra we skip applying + // the loop filter on block edges. + if (skip && ref > INTRA_FRAME) + return; + + // Here we are adding a mask for the transform size. The transform + // size mask is set to be correct for a 64x64 prediction block size. We + // mask to match the size of the block we are working on and then shift it + // into place.. + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + *above_uv |= (size_mask_uv[block_size] & + above_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + *left_uv |= (size_mask_uv[block_size] & + left_64x64_txform_mask_uv[tx_size_uv]) << shift_uv; + + // Here we are trying to determine what to do with the internal 4x4 block + // boundaries. These differ from the 4x4 boundaries on the outside edge of + // an 8x8 in that the internal ones can be skipped and don't depend on + // the prediction block size. + if (tx_size_y == TX_4X4) { + *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; + } + if (tx_size_uv == TX_4X4) { + *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; + } +} + +// This function does the same thing as the one above with the exception that +// it only affects the y masks. It exists because for blocks < 16x16 in size, +// we only update u and v masks on the first block. +static void build_y_mask(const loop_filter_info_n *const lfi_n, + const MODE_INFO *mi, const int shift_y, + LOOP_FILTER_MASK *lfm) { + const BLOCK_SIZE block_size = mi->mbmi.sb_type; + const TX_SIZE tx_size_y = mi->mbmi.tx_size; + const int skip = mi->mbmi.skip_coeff; + const int seg = mi->mbmi.segment_id; + const int ref = mi->mbmi.ref_frame[0]; + const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode]; + uint64_t *left_y = &lfm->left_y[tx_size_y]; + uint64_t *above_y = &lfm->above_y[tx_size_y]; + uint64_t *int_4x4_y = &lfm->int_4x4_y; + + if (!filter_level) + return; + + *above_y |= above_prediction_mask[block_size] << shift_y; + *left_y |= left_prediction_mask[block_size] << shift_y; + + if (skip && ref > INTRA_FRAME) + return; + + *above_y |= (size_mask[block_size] & + above_64x64_txform_mask[tx_size_y]) << shift_y; + + *left_y |= (size_mask[block_size] & + left_64x64_txform_mask[tx_size_y]) << shift_y; + + if (tx_size_y == TX_4X4) { + *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; + } +} + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. +// TODO(JBB): This function only works for yv12. +static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { + int idx_32, idx_16, idx_8; + const loop_filter_info_n *const lfi_n = &cm->lf_info; + MODE_INFO **mip = mi_8x8; + MODE_INFO **mip2 = mi_8x8; + + // These are offsets to the next mi in the 64x64 block. It is what gets + // added to the mi ptr as we go through each loop. It helps us to avoids + // setting up special row and column counters for each index. The last step + // brings us out back to the starting position. 
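Concretely, at the 32x32 level the walk visits the four quadrants in raster order and the fourth offset returns the pointer to its origin. A standalone simulation (the stride value is an arbitrary stand-in for mode_info_stride):

#include <stdio.h>

int main(void) {
  const int stride = 64;  /* stand-in for cm->mode_info_stride */
  const int offset_32[] = {4, (stride << 2) - 4, 4, -(stride << 2) - 4};
  int pos = 0, i;
  for (i = 0; i < 4; ++i) {
    /* Visits (0,0), (0,4), (4,0), (4,4) in mi units. */
    printf("quadrant %d at (row %d, col %d)\n", i, pos / stride, pos % stride);
    pos += offset_32[i];
  }
  /* pos is 0 again here: ready for the next 64x64 block. */
  return 0;
}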
+ const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4, + -(mode_info_stride << 2) - 4}; + const int offset_16[] = {2, (mode_info_stride << 1) - 2, 2, + -(mode_info_stride << 1) - 2}; + const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1}; + + // Following variables represent shifts to position the current block + // mask over the appropriate block. A shift of 36 to the left will move + // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left + // 4 rows to the appropriate spot. + const int shift_32_y[] = {0, 4, 32, 36}; + const int shift_16_y[] = {0, 2, 16, 18}; + const int shift_8_y[] = {0, 1, 8, 9}; + const int shift_32_uv[] = {0, 2, 8, 10}; + const int shift_16_uv[] = {0, 1, 4, 5}; + int i; + const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ? + cm->mi_rows - mi_row : MI_BLOCK_SIZE); + const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ? + cm->mi_cols - mi_col : MI_BLOCK_SIZE); + + vp9_zero(*lfm); + + // TODO(jimbankoski): Try moving most of the following code into decode + // loop and storing lfm in the mbmi structure so that we don't have to go + // through the recursive loop structure multiple times. + switch (mip[0]->mbmi.sb_type) { + case BLOCK_64X64: + build_masks(lfi_n, mip[0] , 0, 0, lfm); + break; + case BLOCK_64X32: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + mode_info_stride * 4; + if (4 >= max_rows) + break; + build_masks(lfi_n, mip2[0], 32, 8, lfm); + break; + case BLOCK_32X64: + build_masks(lfi_n, mip[0], 0, 0, lfm); + mip2 = mip + 4; + if (4 >= max_cols) + break; + build_masks(lfi_n, mip2[0], 4, 2, lfm); + break; + default: + for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { + const int shift_y = shift_32_y[idx_32]; + const int shift_uv = shift_32_uv[idx_32]; + const int mi_32_col_offset = ((idx_32 & 1) << 2); + const int mi_32_row_offset = ((idx_32 >> 1) << 2); + if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) + continue; + switch (mip[0]->mbmi.sb_type) { + case BLOCK_32X32: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + break; + case BLOCK_32X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_32_row_offset + 2 >= max_rows) + continue; + mip2 = mip + mode_info_stride * 2; + build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + break; + case BLOCK_16X32: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_32_col_offset + 2 >= max_cols) + continue; + mip2 = mip + 2; + build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + break; + default: + for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) { + const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; + const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int mi_16_col_offset = mi_32_col_offset + + ((idx_16 & 1) << 1); + const int mi_16_row_offset = mi_32_row_offset + + ((idx_16 >> 1) << 1); + + if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows) + continue; + + switch (mip[0]->mbmi.sb_type) { + case BLOCK_16X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + break; + case BLOCK_16X8: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_16_row_offset + 1 >= max_rows) + continue; + mip2 = mip + mode_info_stride; + build_y_mask(lfi_n, mip2[0], shift_y+8, lfm); + break; + case BLOCK_8X16: + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + if (mi_16_col_offset +1 >= max_cols) + continue; + mip2 = mip + 1; + build_y_mask(lfi_n, mip2[0], shift_y+1, lfm); + break; + default: { + const int shift_y = 
shift_32_y[idx_32] + + shift_16_y[idx_16] + + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + mip += offset[0]; + for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { + const int shift_y = shift_32_y[idx_32] + + shift_16_y[idx_16] + + shift_8_y[idx_8]; + const int mi_8_col_offset = mi_16_col_offset + + ((idx_8 & 1)); + const int mi_8_row_offset = mi_16_row_offset + + ((idx_8 >> 1)); + + if (mi_8_col_offset >= max_cols || + mi_8_row_offset >= max_rows) + continue; + build_y_mask(lfi_n, mip[0], shift_y, lfm); + } + break; + } + } + } + break; + } + } + break; + } + // The largest loopfilter we have is 16x16 so we use the 16x16 mask + // for 32x32 transforms also also. + lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32]; + lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32]; + lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32]; + lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; + + // We do at least 8 tap filter on every 32x32 even if the transform size + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // remove it from the 4x4. + lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; + lfm->left_y[TX_4X4] &= ~left_border; + lfm->above_y[TX_8X8] |= lfm->above_y[TX_4X4] & above_border; + lfm->above_y[TX_4X4] &= ~above_border; + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_4X4] & left_border_uv; + lfm->left_uv[TX_4X4] &= ~left_border_uv; + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_4X4] & above_border_uv; + lfm->above_uv[TX_4X4] &= ~above_border_uv; + + // We do some special edge handling. + if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) { + const uint64_t rows = cm->mi_rows - mi_row; + + // Each pixel inside the border gets a 1, + const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1); + const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1); + + // Remove values completely outside our border. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv; + + // We don't apply a wide loop filter on the last uv block row. If set + // apply the shorter one instead. + if (rows == 1) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; + lfm->above_uv[TX_16X16] = 0; + } + if (rows == 5) { + lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00; + lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00); + } + } + + if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) { + const uint64_t columns = cm->mi_cols - mi_col; + + // Each pixel inside the border gets a 1, the multiply copies the border + // to where we need it. + const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101; + const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111; + + // Internal edges are not applied on the last column of the image so + // we mask 1 more for the internal edges + const uint16_t mask_uv_int = ((1 << (columns >> 1)) - 1) * 0x1111; + + // Remove the bits outside the image edge. + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= mask_y; + lfm->above_y[i] &= mask_y; + lfm->left_uv[i] &= mask_uv; + lfm->above_uv[i] &= mask_uv; + } + lfm->int_4x4_y &= mask_y; + lfm->int_4x4_uv &= mask_uv_int; + + // We don't apply a wide loop filter on the last uv column. If set + // apply the shorter one instead. 
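The column-edge masks above lean on a replication trick: ((1 << columns) - 1) forms a run of ones in the bottom row, and multiplying by 0x0101010101010101 copies that run into all eight rows. A standalone restatement (assuming 1 <= columns <= 8, which the MI_BLOCK_SIZE bound guarantees):

#include <stdio.h>
#include <stdint.h>

/* Keep only the first 'columns' columns of 8x8 blocks in a y mask. */
static uint64_t column_mask(int columns) {
  return ((UINT64_C(1) << columns) - 1) * UINT64_C(0x0101010101010101);
}

int main(void) {
  /* 3 remaining columns: 00000111 in every row, i.e. 0x0707070707070707. */
  printf("%016llx\n", (unsigned long long)column_mask(3));
  return 0;
}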
+ if (columns == 1) { + lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; + lfm->left_uv[TX_16X16] = 0; + } + if (columns == 5) { + lfm->left_uv[TX_8X8] |= (lfm->left_uv[TX_16X16] & 0xcccc); + lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); + } + } + // We don't a loop filter on the first column in the image. Mask that out. + if (mi_col == 0) { + for (i = 0; i < TX_32X32; i++) { + lfm->left_y[i] &= 0xfefefefefefefefe; + lfm->left_uv[i] &= 0xeeee; + } + } +} +#if CONFIG_NON420 +static void filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, + int mi_row, int mi_col) { const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_x; @@ -262,24 +816,25 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { - const int skip_this = mi[c].mbmi.mb_skip_coeff - && is_inter_block(&mi[c].mbmi); + const MODE_INFO *mi = mi_8x8[c]; + const int skip_this = mi[0].mbmi.skip_coeff + && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip - const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ? - !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ? + !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; const int skip_this_c = skip_this && !block_edge_left; // top edge of current unit is block/partition edge -> no skip - const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ? - !(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ? + !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) - ? get_uv_tx_size(&mi[c].mbmi) - : mi[c].mbmi.txfm_size; + ? 
get_uv_tx_size(&mi[0].mbmi) + : mi[0].mbmi.tx_size; const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x))) + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) continue; // Build masks based on the transform size of each block @@ -338,7 +893,7 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_4x4_c & border_mask, mask_4x4_int[r], lfi[r]); dst->buf += 8 * dst->stride; - mi += row_step_stride; + mi_8x8 += row_step_stride; } // Now do horizontal pass @@ -355,33 +910,146 @@ static void filter_block_plane(VP9_COMMON *const cm, dst->buf += 8 * dst->stride; } } +#endif + +static void filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + MODE_INFO **mi_8x8, + int mi_row, int mi_col, + LOOP_FILTER_MASK *lfm) { + const int ss_x = plane->subsampling_x; + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_x; + const int col_step = 1 << ss_y; + const int row_step_stride = cm->mode_info_stride * row_step; + struct buf_2d *const dst = &plane->dst; + uint8_t* const dst0 = dst->buf; + unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; + struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; + int r, c; + int row_shift = 3 - ss_x; + int row_mask = 0xff >> (ss_x << 2); + +#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask) + + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + int r_sampled = r >> ss_x; + + // Determine the vertical edges that need filtering + for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { + const MODE_INFO *mi = mi_8x8[c]; + if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x))) + continue; + } + if (!plane->plane_type) { + mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y); + // Disable filtering on the leftmost column + filter_selectively_vert(dst->buf, dst->stride, + MASK_ROW(lfm->left_y[TX_16X16]), + MASK_ROW(lfm->left_y[TX_8X8]), + MASK_ROW(lfm->left_y[TX_4X4]), + MASK_ROW(lfm->int_4x4_y), + lfi[r]); + } else { + mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv); + // Disable filtering on the leftmost column + filter_selectively_vert(dst->buf, dst->stride, + MASK_ROW(lfm->left_uv[TX_16X16]), + MASK_ROW(lfm->left_uv[TX_8X8]), + MASK_ROW(lfm->left_uv[TX_4X4]), + MASK_ROW(lfm->int_4x4_uv), + lfi[r]); + } + dst->buf += 8 * dst->stride; + mi_8x8 += row_step_stride; + } + + // Now do horizontal pass + dst->buf = dst0; + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; + int r_sampled = r >> ss_x; + + if (!plane->plane_type) { + filter_selectively_horiz(dst->buf, dst->stride, + MASK_ROW(lfm->above_y[TX_16X16]), + MASK_ROW(lfm->above_y[TX_8X8]), + MASK_ROW(lfm->above_y[TX_4X4]), + MASK_ROW(lfm->int_4x4_y), + mi_row + r == 0, lfi[r]); + } else { + filter_selectively_horiz(dst->buf, dst->stride, + MASK_ROW(lfm->above_uv[TX_16X16]), + MASK_ROW(lfm->above_uv[TX_8X8]), + MASK_ROW(lfm->above_uv[TX_4X4]), + mask_4x4_int_r, + mi_row + r == 0, lfi[r]); + } + dst->buf += 8 * dst->stride; + } +#undef MASK_ROW +} void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, VP9_COMMON *cm, MACROBLOCKD *xd, int start, int stop, int y_only) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; int mi_row, mi_col; + LOOP_FILTER_MASK lfm; +#if CONFIG_NON420 + int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && + xd->plane[1].subsampling_x == 1); +#endif for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { - MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride; + MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + + // TODO(JBB): Make setup_mask work for non 420. +#if CONFIG_NON420 + if (use_420) +#endif + setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, + &lfm); + for (plane = 0; plane < num_planes; ++plane) { - filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col); +#if CONFIG_NON420 + if (use_420) +#endif + filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row, + mi_col, &lfm); +#if CONFIG_NON420 + else + filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, + mi_row, mi_col); +#endif } } } } void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, - int frame_filter_level, int y_only) { + int frame_filter_level, + int y_only, int partial) { + int start_mi_row, end_mi_row, mi_rows_to_filter; if (!frame_filter_level) return; - vp9_loop_filter_frame_init(cm, xd, frame_filter_level); + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); vp9_loop_filter_rows(cm->frame_to_show, cm, xd, - 0, cm->mi_rows, y_only); + start_mi_row, end_mi_row, + y_only); } int vp9_loop_filter_worker(void *arg1, void *arg2) { diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index 5fc9094..91d40ac 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -22,6 +22,27 @@ #define SIMD_WIDTH 16 +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +struct loopfilter { + int filter_level; + + int sharpness_level; + int last_sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, GF, ARF + signed char ref_deltas[MAX_REF_LF_DELTAS]; + signed char last_ref_deltas[MAX_REF_LF_DELTAS]; + + // 0 = ZERO_MV, MV + signed char mode_deltas[MAX_MODE_LF_DELTAS]; + signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; +}; + // Need to align this structure so when it is declared and // passed it can be loaded into vector registers. typedef struct { @@ -39,19 +60,17 @@ typedef struct { struct VP9Common; struct macroblockd; -void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf); +void vp9_loop_filter_init(struct VP9Common *cm); // Update the loop filter for the current frame. // This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame() // calls this function directly. -void vp9_loop_filter_frame_init(struct VP9Common *const cm, - struct macroblockd *const xd, - int default_filt_lvl); +void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd, int filter_level, - int y_only); + int y_only, int partial); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. 
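The partial flag added to vp9_loop_filter_frame above restricts filtering to a band around the middle of the frame (and only when cm->mi_rows > 8), which is cheap enough for an encoder probing filter levels. The [start, stop) range it hands to the row-based entry point declared below works out as in this sketch (mine, mirroring the arithmetic above):

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Row band used by partial filtering: start at mid-frame, aligned down to
 * a superblock (8 mi units), and cover at least 8 mi rows. */
static void partial_range(int mi_rows, int *start, int *stop) {
  *start = (mi_rows >> 1) & ~7;
  *stop = *start + MAX(mi_rows / 8, 8);
}

int main(void) {
  int start, stop;
  partial_range(135, &start, &stop);  /* 1080-tall frame: mi_rows = 135 */
  printf("[%d, %d)\n", start, stop);  /* prints [64, 80) */
  return 0;
}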
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index 3b72f41..bfeeb57 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -1,3 +1,4 @@ + /* * Copyright (c) 2012 The WebM project authors. All Rights Reserved. * @@ -36,7 +37,7 @@ static const int mode_2_counter[MB_MODE_COUNT] = { 9, // D135_PRED 9, // D117_PRED 9, // D153_PRED - 9, // D27_PRED + 9, // D207_PRED 9, // D63_PRED 9, // TM_PRED 0, // NEARESTMV @@ -70,33 +71,33 @@ static const int counter_to_context[19] = { BOTH_INTRA // 18 }; -static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { - // SB4X4 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB4X8 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB8X4 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB8X8 - {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, - // SB8X16 - {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, - // SB16X8 +static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { + // 4X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 4X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X16 {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, - // SB16X16 - {{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}}, - // SB16X32 - {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // SB32X16 + // 16X8 + {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, + // 16X16 + {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 16X32 {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, - // SB32X32 - {{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}}, - // SB32X64 - {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, - // SB64X32 + // 32X16 + {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X32 + {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X64 {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, - // SB64X64 - {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}} + // 64X32 + {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, + // 64X64 + {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} }; static const int idx_n_column_to_subblock[4][2] = { @@ -109,11 +110,11 @@ static const int idx_n_column_to_subblock[4][2] = { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { - clamp_mv(&mv->as_mv, xd->mb_to_left_edge - MV_BORDER, - xd->mb_to_right_edge + MV_BORDER, - xd->mb_to_top_edge - MV_BORDER, - xd->mb_to_bottom_edge + MV_BORDER); +static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); } // This function returns 
either the appropriate sub block or block's mv @@ -121,89 +122,75 @@ static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int check_sub_blocks, int which_mv, int search_col, int block_idx) { - return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_SIZE_SB8X8 + return check_sub_blocks && candidate->mbmi.sb_type < BLOCK_8X8 ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] .as_mv[which_mv] - : candidate->mbmi.mv[which_mv]); + : candidate->mbmi.mv[which_mv]; } // Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv, +static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, const MV_REFERENCE_FRAME this_ref_frame, const int *ref_sign_bias) { - int_mv return_mv = candidate->mbmi.mv[which_mv]; - - // Sign inversion where appropriate. - if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] != - ref_sign_bias[this_ref_frame]) { - return_mv.as_mv.row *= -1; - return_mv.as_mv.col *= -1; + int_mv mv = mbmi->mv[ref]; + if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; } - return return_mv; + return mv; } // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector it will also // skip all additional processing and jump to done! #define ADD_MV_REF_LIST(MV) \ - if (refmv_count) { \ - if ((MV).as_int != mv_ref_list[0].as_int) { \ - mv_ref_list[refmv_count] = (MV); \ - goto Done; \ + do { \ + if (refmv_count) { \ + if ((MV).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (MV); \ + goto Done; \ + } \ + } else { \ + mv_ref_list[refmv_count++] = (MV); \ } \ - } else { \ - mv_ref_list[refmv_count++] = (MV); \ - } + } while (0) // If either reference frame is different, not INTRA, and they // are different from each other scale and add the mv to our list. #define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \ - if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ - } \ - if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \ - (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \ - (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ - } + do { \ + if ((CANDIDATE)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ + if ((CANDIDATE)->ref_frame[1] != ref_frame && \ + has_second_ref(CANDIDATE) && \ + (CANDIDATE)->mv[1].as_int != (CANDIDATE)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ + } while (0) + // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. -static INLINE int is_inside(const int mi_col, const int mi_row, - const int cur_tile_mi_col_start, - const int cur_tile_mi_col_end, const int mi_rows, - const int (*mv_ref_search)[2], int idx) { - int mi_search_col; - const int mi_search_row = mi_row + mv_ref_search[idx][1];; - - // Check that the candidate is within the border. We only need to check - // the left side because all the positive right side ones are for blocks that - // are large enough to support the + value they have within their border. 
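The old check above guarded only the top and left sides; the replacement just below also rejects candidates past the bottom of the frame and the right edge of the tile. A positive-form equivalent (standalone, with simplified names standing in for the VP9_COMMON fields):

#include <stdio.h>

/* A candidate offset (dr, dc), in mi units, is usable only if the shifted
 * position stays inside the frame rows and the current tile's columns. */
static int candidate_is_inside(int mi_row, int mi_col, int dr, int dc,
                               int tile_start, int tile_end, int mi_rows) {
  return mi_row + dr >= 0 &&
         mi_col + dc >= tile_start &&
         mi_row + dr < mi_rows &&
         mi_col + dc < tile_end;
}

int main(void) {
  /* A candidate one row above the top of the frame is rejected. */
  printf("%d\n", candidate_is_inside(0, 5, -1, 0, 0, 8, 64));
  return 0;
}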
- if (mi_search_row < 0) - return 0; - - mi_search_col = mi_col + mv_ref_search[idx][0]; - if (mi_search_col < cur_tile_mi_col_start) - return 0; - - return 1; +static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row, + const MV *mv) { + return !(mi_row + mv->row < 0 || + mi_col + mv->col < cm->cur_tile_mi_col_start || + mi_row + mv->row >= cm->mi_rows || + mi_col + mv->col >= cm->cur_tile_mi_col_end); } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, - const MODE_INFO *lf_here, - const MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, const int *ref_sign_bias, - const int block_idx, - const int mi_row, const int mi_col) { - int idx; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - int refmv_count = 0; - const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type]; - const MODE_INFO *candidate; - const int check_sub_blocks = block_idx >= 0; +void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block_idx, + int mi_row, int mi_col) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; int different_ref_found = 0; int context_counter = 0; @@ -213,28 +200,27 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, // The nearest 2 blocks are treated differently // if the size < 8x8 we get the mv from the bmi substructure, // and we also need to keep a mode count. - for (idx = 0; idx < 2; ++idx) { - if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, - cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) - continue; - - candidate = here + mv_ref_search[idx][0] - + mv_ref_search[idx][1] * xd->mode_info_stride; - - // Keep counts for entropy encoding. - context_counter += mode_2_counter[candidate->mbmi.mode]; - - // Check if the candidate comes from the same reference frame. - if (candidate->mbmi.ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0, - mv_ref_search[idx][0], block_idx)); - different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; - } else { - different_ref_found = 1; - if (candidate->mbmi.ref_frame[1] == ref_frame) { - // Add second motion vector if it has the same ref_frame. - ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1, - mv_ref_search[idx][0], block_idx)); + for (i = 0; i < 2; ++i) { + const MV *const mv_ref = &mv_ref_search[i]; + if (is_inside(cm, mi_col, mi_row, mv_ref)) { + const int check_sub_blocks = block_idx >= 0; + const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row + * xd->mode_info_stride]; + const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mode]; + + // Check if the candidate comes from the same reference frame. + if (candidate->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 0, + mv_ref->col, block_idx)); + different_ref_found = candidate->ref_frame[1] != ref_frame; + } else { + if (candidate->ref_frame[1] == ref_frame) + // Add second motion vector if it has the same ref_frame. 
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, check_sub_blocks, 1, + mv_ref->col, block_idx)); + different_ref_found = 1; } } } @@ -242,68 +228,59 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, // Check the rest of the neighbors in much the same way // as before except we don't need to keep track of sub blocks or // mode counts. - for (; idx < MVREF_NEIGHBOURS; ++idx) { - if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, - cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) - continue; - - candidate = here + mv_ref_search[idx][0] - + mv_ref_search[idx][1] * xd->mode_info_stride; - - if (candidate->mbmi.ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(candidate->mbmi.mv[0]); - different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; - } else { - different_ref_found = 1; - if (candidate->mbmi.ref_frame[1] == ref_frame) { - ADD_MV_REF_LIST(candidate->mbmi.mv[1]); + for (; i < MVREF_NEIGHBOURS; ++i) { + const MV *const mv_ref = &mv_ref_search[i]; + if (is_inside(cm, mi_col, mi_row, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + + mv_ref->row + * xd->mode_info_stride]->mbmi; + + if (candidate->ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(candidate->mv[0]); + different_ref_found = candidate->ref_frame[1] != ref_frame; + } else { + if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[1]); + different_ref_found = 1; } } } // Check the last frame's mode and mv info. - if (lf_here != NULL) { - if (lf_here->mbmi.ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(lf_here->mbmi.mv[0]); - } else if (lf_here->mbmi.ref_frame[1] == ref_frame) { - ADD_MV_REF_LIST(lf_here->mbmi.mv[1]); - } + if (prev_mbmi) { + if (prev_mbmi->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(prev_mbmi->mv[0]); + else if (prev_mbmi->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(prev_mbmi->mv[1]); } // Since we couldn't find 2 mvs from the same reference frame // go back through the neighbors and find motion vectors from // different reference frames. if (different_ref_found) { - for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) { - if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, - cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) - continue; - - candidate = here + mv_ref_search[idx][0] - + mv_ref_search[idx][1] * xd->mode_info_stride; - - // If the candidate is INTRA we don't want to consider its mv. - if (candidate->mbmi.ref_frame[0] == INTRA_FRAME) - continue; - - IF_DIFF_REF_FRAME_ADD_MV(candidate); + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const MV *mv_ref = &mv_ref_search[i]; + if (is_inside(cm, mi_col, mi_row, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + + mv_ref->row + * xd->mode_info_stride]->mbmi; + + // If the candidate is INTRA we don't want to consider its mv. + if (is_inter_block(candidate)) + IF_DIFF_REF_FRAME_ADD_MV(candidate); + } } } // Since we still don't have a candidate we'll try the last frame. 
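A side note on the macro rewrite earlier in this hunk: wrapping ADD_MV_REF_LIST and IF_DIFF_REF_FRAME_ADD_MV in do { ... } while (0) is the standard guard that makes a multi-statement macro behave as a single statement, so the unbraced "if (...) ADD_MV_REF_LIST(...);" call sites above stay well formed. A minimal illustration (not from the change):

#include <stdio.h>

#define ADD_SAFE(x) do { if ((x) > 0) puts("kept"); else puts("dropped"); } while (0)

/* A bare if/else body, e.g.
 *   #define ADD_BARE(x) if ((x) > 0) puts("kept"); else puts("dropped")
 * would not compile below: the ';' after ADD_BARE(v) terminates the
 * macro's if/else, leaving the caller's 'else' dangling. */

int main(void) {
  int v = 1, have_room = 1;
  if (have_room)
    ADD_SAFE(v);
  else
    puts("list full");
  return 0;
}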
- if (lf_here != NULL && lf_here->mbmi.ref_frame[0] != INTRA_FRAME) { - IF_DIFF_REF_FRAME_ADD_MV(lf_here); - } + if (prev_mbmi && is_inter_block(prev_mbmi)) + IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi); Done: - mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter]; + mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter]; // Clamp vectors - for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { - clamp_mv_ref(xd, &mv_ref_list[idx]); - } + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } - -#undef ADD_MV_REF_LIST -#undef IF_DIFF_REF_FRAME_ADD_MV diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index c5f89eb..39ebdb0 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -14,27 +14,20 @@ #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ #define VP9_COMMON_VP9_MVREF_COMMON_H_ -void vp9_find_mv_refs_idx(VP9_COMMON *cm, - MACROBLOCKD *xd, - MODE_INFO *here, - const MODE_INFO *lf_here, - const MV_REFERENCE_FRAME ref_frame, +void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, const MODE_INFO *prev_mi, + MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - const int *ref_sign_bias, - const int block_idx, - const int mi_row, - const int mi_col); + int block_idx, + int mi_row, int mi_col); -static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, - MACROBLOCKD *xd, - MODE_INFO *here, - MODE_INFO *lf_here, +static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO *mi, const MODE_INFO *prev_mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int *ref_sign_bias, int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame, - mv_ref_list, ref_sign_bias, -1, mi_row, mi_col); + vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame, + mv_ref_list, -1, mi_row, mi_col); } #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h index 152046f..f424e6a 100644 --- a/libvpx/vp9/common/vp9_onyx.h +++ b/libvpx/vp9/common/vp9_onyx.h @@ -46,7 +46,8 @@ extern "C" typedef enum { USAGE_STREAM_FROM_SERVER = 0x0, USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2 + USAGE_CONSTRAINED_QUALITY = 0x2, + USAGE_CONSTANT_QUALITY = 0x3, } END_USAGE; @@ -130,6 +131,8 @@ extern "C" // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- + // Spatial scalability + int ss_number_layers; // these parameters aren't to be used in final build don't use!!! 
int play_alternate; @@ -210,6 +213,13 @@ extern "C" int vp9_set_internal_size(VP9_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + int vp9_set_size_literal(VP9_PTR comp, unsigned int width, + unsigned int height); + + int vp9_switch_layer(VP9_PTR comp, int layer); + + void vp9_set_svc(VP9_PTR comp, int use_svc); + int vp9_get_quantizer(VP9_PTR c); #ifdef __cplusplus diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 152a932..0431e14 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -20,7 +20,7 @@ #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_quant_common.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif @@ -38,14 +38,14 @@ #define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) typedef struct frame_contexts { - vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; - vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; + vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS - 1]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; vp9_prob single_ref_prob[REF_CONTEXTS][2]; @@ -56,15 +56,15 @@ typedef struct frame_contexts { } FRAME_CONTEXT; typedef struct { - unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; - unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES]; + unsigned int switchable_interp[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; unsigned int single_ref[REF_CONTEXTS][2][2]; @@ -164,6 +164,10 @@ typedef struct VP9Common { MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + MODE_INFO **mi_grid_base; + MODE_INFO **mi_grid_visible; + MODE_INFO **prev_mi_grid_base; + MODE_INFO **prev_mi_grid_visible; // Persistent mb segment id map used in prediction. 
unsigned char *last_frame_seg_map; @@ -176,6 +180,9 @@ typedef struct VP9Common { int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + struct loopfilter lf; + struct segmentation seg; + /* Y,U,V */ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; @@ -198,7 +205,7 @@ typedef struct VP9Common { unsigned int current_video_frame; int version; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC struct postproc_state postproc_state; #endif @@ -231,7 +238,19 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) { } static int mi_cols_aligned_to_sb(int n_mis) { - return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE); + return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); +} + +static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col) { + const int above_idx = mi_col * 2; + const int left_idx = (mi_row * 2) & 15; + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x); + pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y); + } } static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -240,25 +259,20 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); } -static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - int bsl = mi_width_log2(bsize), bs = 1 << bsl; - int ms = bs / 2; +// return the node index in the prob tree for binary coding +static int check_bsize_coverage(int bs, int mi_rows, int mi_cols, + int mi_row, int mi_col) { + const int r = (mi_row + bs < mi_rows); + const int c = (mi_col + bs < mi_cols); - if ((mi_row + ms < cm->mi_rows) && (mi_col + ms < cm->mi_cols)) + if (r && c) return 0; - // frame width/height are multiples of 8, hence 8x8 block should always - // pass the above check - assert(bsize > BLOCK_SIZE_SB8X8); - - // return the node index in the prob tree for binary coding - // only allow horizontal/split partition types - if ((mi_col + ms < cm->mi_cols) && (mi_row + ms >= cm->mi_rows)) - return 1; - // only allow vertical/split partition types - if ((mi_row + ms < cm->mi_rows) && (mi_col + ms >= cm->mi_cols)) - return 2; + if (c && !r) + return 1; // only allow horizontal/split partition types + + if (r && !c) + return 2; // only allow vertical/split partition types return -1; } diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 1157fbb..955e676 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -53,7 +53,7 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { { RGB_TO_YUV(0xCC33FF) }, /* Magenta */ }; -static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = { +static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = { { RGB_TO_YUV(0x6633ff) }, /* Purple */ { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ { RGB_TO_YUV(0xff33cc) }, /* Pink */ @@ -630,23 +630,21 @@ static void constrain_line(int x0, int *x1, int y0, int *y1, } } -int vp9_post_proc_frame(struct VP9Common *oci, - struct loopfilter *lf, - YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { - int q = lf->filter_level * 10 / 6; +int vp9_post_proc_frame(struct VP9Common *cm, + YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { + int q = cm->lf.filter_level * 10 / 6; int flags = ppflags->post_proc_flag; int 
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index 1157fbb..955e676 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -53,7 +53,7 @@ static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
   { RGB_TO_YUV(0xCC33FF) },  /* Magenta */
 };
 
-static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = {
+static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = {
   { RGB_TO_YUV(0x6633ff) },  /* Purple */
   { RGB_TO_YUV(0xcc33ff) },  /* Magenta */
   { RGB_TO_YUV(0xff33cc) },  /* Pink */
@@ -630,23 +630,21 @@ static void constrain_line(int x0, int *x1, int y0, int *y1,
   }
 }
 
-int vp9_post_proc_frame(struct VP9Common *oci,
-                        struct loopfilter *lf,
-                        YV12_BUFFER_CONFIG *dest,
-                        vp9_ppflags_t *ppflags) {
-  int q = lf->filter_level * 10 / 6;
+int vp9_post_proc_frame(struct VP9Common *cm,
+                        YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
+  int q = cm->lf.filter_level * 10 / 6;
   int flags = ppflags->post_proc_flag;
   int deblock_level = ppflags->deblocking_level;
   int noise_level = ppflags->noise_level;
 
-  if (!oci->frame_to_show)
+  if (!cm->frame_to_show)
     return -1;
 
   if (q > 63)
     q = 63;
 
   if (!flags) {
-    *dest = *oci->frame_to_show;
+    *dest = *cm->frame_to_show;
     return 0;
   }
 
@@ -655,52 +653,52 @@ int vp9_post_proc_frame(struct VP9Common *oci,
 #endif
 
   if (flags & VP9D_DEMACROBLOCK) {
-    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+    deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer,
                                q + (deblock_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
+    vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q);
   } else {
-    vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+    vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer);
   }
 
   if (flags & VP9D_ADDNOISE) {
-    if (oci->postproc_state.last_q != q
-        || oci->postproc_state.last_noise != noise_level) {
-      fillrd(&oci->postproc_state, 63 - q, noise_level);
+    if (cm->postproc_state.last_q != q
+        || cm->postproc_state.last_noise != noise_level) {
+      fillrd(&cm->postproc_state, 63 - q, noise_level);
     }
 
-    vp9_plane_add_noise(oci->post_proc_buffer.y_buffer,
-                        oci->postproc_state.noise,
-                        oci->postproc_state.blackclamp,
-                        oci->postproc_state.whiteclamp,
-                        oci->postproc_state.bothclamp,
-                        oci->post_proc_buffer.y_width,
-                        oci->post_proc_buffer.y_height,
-                        oci->post_proc_buffer.y_stride);
+    vp9_plane_add_noise(cm->post_proc_buffer.y_buffer,
+                        cm->postproc_state.noise,
+                        cm->postproc_state.blackclamp,
+                        cm->postproc_state.whiteclamp,
+                        cm->postproc_state.bothclamp,
+                        cm->post_proc_buffer.y_width,
+                        cm->post_proc_buffer.y_height,
+                        cm->post_proc_buffer.y_stride);
   }
 
 #if 0 && CONFIG_POSTPROC_VISUALIZER
   if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
     char message[512];
     sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
-            (oci->frame_type == KEY_FRAME),
-            oci->refresh_golden_frame,
-            oci->base_qindex,
-            oci->filter_level,
+            (cm->frame_type == KEY_FRAME),
+            cm->refresh_golden_frame,
+            cm->base_qindex,
+            cm->filter_level,
             flags,
-            oci->mb_cols, oci->mb_rows);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
+            cm->mb_cols, cm->mb_rows);
+    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+                  cm->post_proc_buffer.y_stride);
   }
 
   if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width >> 4;
     int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
+    MODE_INFO *mi = cm->mi;
 
     y_ptr = post->y_buffer + 4 * post->y_stride + 4;
 
@@ -725,11 +723,11 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
     int i, j;
     uint8_t *y_ptr;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width >> 4;
     int mb_index = 0;
-    MODE_INFO *mi = oci->mi;
+    MODE_INFO *mi = cm->mi;
 
     y_ptr = post->y_buffer + 4 * post->y_stride + 4;
 
@@ -739,9 +737,9 @@ int vp9_post_proc_frame(struct VP9Common *oci,
       char zz[4];
       int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED &&
                       mi[mb_index].mbmi.mode != SPLITMV &&
-                      mi[mb_index].mbmi.mb_skip_coeff);
+                      mi[mb_index].mbmi.skip_coeff);
 
-      if (oci->frame_type == KEY_FRAME)
+      if (cm->frame_type == KEY_FRAME)
         sprintf(zz, "a");
       else
         sprintf(zz, "%c", dc_diff + '0');
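For orientation, the strength plumbing at the top of vp9_post_proc_frame() is unchanged by the refactor: the postproc quantizer proxy is derived from the loop-filter level (now read from cm->lf rather than a separate loopfilter argument) and clamped to 63, and the demacroblocker adds a user-controlled offset on top. A standalone sketch of that arithmetic with illustrative values:

#include <stdio.h>

int main(void) {
  int filter_level, deblock_level = 6;  /* deblock_level is user input */
  for (filter_level = 0; filter_level <= 63; filter_level += 21) {
    int q = filter_level * 10 / 6;      /* as in vp9_post_proc_frame() */
    if (q > 63)
      q = 63;
    printf("filter_level %2d -> q %2d, demacroblock strength %3d\n",
           filter_level, q, q + (deblock_level - 5) * 10);
  }
  return 0;
}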
@@ -761,19 +759,19 @@
     char message[512];
     snprintf(message, sizeof(message),
              "Bitrate: %10.2f framerate: %10.2f ",
-             oci->bitrate, oci->framerate);
-    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
-                  oci->post_proc_buffer.y_stride);
+             cm->bitrate, cm->framerate);
+    vp9_blit_text(message, cm->post_proc_buffer.y_buffer,
+                  cm->post_proc_buffer.y_stride);
   }
 
   /* Draw motion vectors */
   if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width = post->y_width;
     int height = post->y_height;
-    uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_buffer = cm->post_proc_buffer.y_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
     int x0, y0;
 
     for (y0 = 0; y0 < height; y0 += 16) {
@@ -882,7 +880,7 @@ int vp9_post_proc_frame(struct VP9Common *oci,
             }
           }
         }
-      } else if (mi->mbmi.mode >= NEARESTMV) {
+      } else if (is_inter_mode(mi->mbmi.mode)) {
         MV *mv = &mi->mbmi.mv.as_mv;
         const int lx0 = x0 + 8;
         const int ly0 = y0 + 8;
@@ -910,14 +908,14 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   if ((flags & VP9D_DEBUG_CLR_BLK_MODES)
       && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width = post->y_width;
     int height = post->y_height;
-    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
       for (x = 0; x < width; x += 16) {
@@ -975,14 +973,14 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) &&
       ppflags->display_ref_frame_flag) {
     int y, x;
-    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer;
     int width = post->y_width;
     int height = post->y_height;
-    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
-    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
-    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
-    int y_stride = oci->post_proc_buffer.y_stride;
-    MODE_INFO *mi = oci->mi;
+    uint8_t *y_ptr = cm->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = cm->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = cm->post_proc_buffer.v_buffer;
+    int y_stride = cm->post_proc_buffer.y_stride;
+    MODE_INFO *mi = cm->mi;
 
     for (y = 0; y < height; y += 16) {
       for (x = 0; x < width; x += 16) {
@@ -1008,12 +1006,13 @@ int vp9_post_proc_frame(struct VP9Common *oci,
   }
 #endif
 
-  *dest = oci->post_proc_buffer;
+  *dest = cm->post_proc_buffer;
 
   /* handle problem with extending borders */
-  dest->y_width = oci->width;
-  dest->y_height = oci->height;
-  dest->uv_height = dest->y_height / 2;
+  dest->y_width = cm->width;
+  dest->y_height = cm->height;
+  dest->uv_width = dest->y_width >> cm->subsampling_x;
+  dest->uv_height = dest->y_height >> cm->subsampling_y;
 
   return 0;
 }
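The final hunk above also fixes a hard-coded 4:2:0 assumption: the output buffer's chroma dimensions are now derived from cm->subsampling_x/y instead of always halving the luma height. A self-contained illustration (the frame size is arbitrary):

#include <stdio.h>

int main(void) {
  const int y_width = 1280, y_height = 720;
  int ss_x, ss_y;
  /* enumerate the four subsampling combinations VP9 can signal */
  for (ss_y = 0; ss_y <= 1; ++ss_y)
    for (ss_x = 0; ss_x <= 1; ++ss_x)
      printf("subsampling %d,%d -> uv plane %dx%d\n", ss_x, ss_y,
             y_width >> ss_x, y_height >> ss_y);
  return 0;
}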
diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h
index a814e39..c63beae 100644
--- a/libvpx/vp9/common/vp9_postproc.h
+++ b/libvpx/vp9/common/vp9_postproc.h
@@ -26,7 +26,7 @@ struct postproc_state {
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
 
-int vp9_post_proc_frame(struct VP9Common *oci, struct loopfilter *lf,
+int vp9_post_proc_frame(struct VP9Common *cm,
                         YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
 
 void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index 795962a..81fbf1f 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -18,50 +18,49 @@
 
 // Returns a context number for the given MB prediction signal
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   // left
-  const int left_mv_pred = is_inter_mode(left_mbmi->mode);
-  const int left_interp = left_in_image && left_mv_pred ?
-                              vp9_switchable_interp_map[left_mbmi->interp_filter] :
-                              VP9_SWITCHABLE_FILTERS;
+  const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
+                                         : 0;
+  const int left_interp = left_in_image && left_mv_pred
+                              ? left_mi->mbmi.interp_filter
+                              : SWITCHABLE_FILTERS;
 
   // above
-  const int above_mv_pred = is_inter_mode(above_mbmi->mode);
-  const int above_interp = above_in_image && above_mv_pred ?
-                               vp9_switchable_interp_map[above_mbmi->interp_filter] :
-                               VP9_SWITCHABLE_FILTERS;
-
-  assert(left_interp != -1);
-  assert(above_interp != -1);
+  const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
+                                           : 0;
+  const int above_interp = above_in_image && above_mv_pred
+                               ? above_mi->mbmi.interp_filter
+                               : SWITCHABLE_FILTERS;
 
   if (left_interp == above_interp)
     return left_interp;
-  else if (left_interp == VP9_SWITCHABLE_FILTERS &&
-           above_interp != VP9_SWITCHABLE_FILTERS)
+  else if (left_interp == SWITCHABLE_FILTERS &&
+           above_interp != SWITCHABLE_FILTERS)
     return above_interp;
-  else if (left_interp != VP9_SWITCHABLE_FILTERS &&
-           above_interp == VP9_SWITCHABLE_FILTERS)
+  else if (left_interp != SWITCHABLE_FILTERS &&
+           above_interp == SWITCHABLE_FILTERS)
     return left_interp;
   else
-    return VP9_SWITCHABLE_FILTERS;
+    return SWITCHABLE_FILTERS;
 }
 
 // Returns a context number for the given MB prediction signal
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
-  const int left_intra = !is_inter_block(left_mbmi);
-  const int above_intra = !is_inter_block(above_mbmi);
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
 
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
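The switchable-filter context above reduces to a small merge rule over the left and above neighbours: agreement wins, a lone inter neighbour wins, otherwise fall back to SWITCHABLE_FILTERS. A runnable mirror; the enum values are assumed to match VP9's ordering (EIGHTTAP = 0, ..., SWITCHABLE_FILTERS = 3):

#include <stdio.h>

enum { EIGHTTAP = 0, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, SWITCHABLE_FILTERS };

/* left/above: each neighbour's filter, or SWITCHABLE_FILTERS when the
   neighbour is missing or not an inter block. */
static int interp_context(int left, int above) {
  if (left == above)
    return left;
  if (left == SWITCHABLE_FILTERS)
    return above;
  if (above == SWITCHABLE_FILTERS)
    return left;
  return SWITCHABLE_FILTERS;
}

int main(void) {
  printf("%d\n", interp_context(EIGHTTAP, EIGHTTAP));                 /* 0 */
  printf("%d\n", interp_context(SWITCHABLE_FILTERS, EIGHTTAP_SHARP)); /* 2 */
  printf("%d\n", interp_context(EIGHTTAP_SMOOTH, EIGHTTAP_SHARP));    /* 3 */
  return 0;
}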
@@ -82,35 +81,35 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
 unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
                                                     const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-        left_mbmi->ref_frame[1] <= INTRA_FRAME)
+    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
       // neither edge uses comp pred (0/1)
       pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
                      (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
-    else if (above_mbmi->ref_frame[1] <= INTRA_FRAME)
+    else if (!has_second_ref(above_mbmi))
       // one of two edges uses comp pred (2/3)
       pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                          above_mbmi->ref_frame[0] == INTRA_FRAME);
-    else if (left_mbmi->ref_frame[1] <= INTRA_FRAME)
+                          !is_inter_block(above_mbmi));
+    else if (!has_second_ref(left_mbmi))
       // one of two edges uses comp pred (2/3)
       pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
-                          left_mbmi->ref_frame[0] == INTRA_FRAME);
+                          !is_inter_block(left_mbmi));
     else
       // both edges use comp pred (4)
       pred_context = 4;
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    if (!has_second_ref(edge_mbmi))
       // edge does not use comp pred (0/1)
       pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
     else
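The comp-inter context just rewritten packs its cases into the values 0-4. A self-contained sketch of the same decision table; the *_fixed flags below stand for "first reference equals cm->comp_fixed_ref, or the block is intra", which the patch folds into the 2/3 cases:

#include <assert.h>
#include <stdio.h>

/* above_comp/left_comp: neighbour uses compound prediction.
   above_fixed/left_fixed: neighbour's first ref is the fixed comp ref
   (or the neighbour is intra). */
static int comp_inter_context(int above_comp, int above_fixed,
                              int left_comp, int left_fixed) {
  if (!above_comp && !left_comp)
    return above_fixed ^ left_fixed;  /* 0 or 1 */
  if (!above_comp)
    return 2 + above_fixed;           /* 2 or 3 */
  if (!left_comp)
    return 2 + left_fixed;            /* 2 or 3 */
  return 4;                           /* both compound */
}

int main(void) {
  assert(comp_inter_context(0, 1, 0, 1) == 0);
  assert(comp_inter_context(0, 1, 0, 0) == 1);
  assert(comp_inter_context(0, 0, 1, 0) == 2);
  assert(comp_inter_context(1, 0, 0, 1) == 3);
  assert(comp_inter_context(1, 0, 1, 0) == 4);
  printf("comp-inter contexts OK\n");
  return 0;
}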
@@ -127,11 +126,14 @@
 unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
                                               const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-cm->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
@@ -140,22 +142,19 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
   const int var_ref_idx = !fix_ref_idx;
 
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
-        left_mbmi->ref_frame[0] == INTRA_FRAME) {  // intra/intra (2)
+    if (above_intra && left_intra) {  // intra/intra (2)
       pred_context = 2;
-    } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
-               left_mbmi->ref_frame[0] == INTRA_FRAME) {  // intra/inter
-      const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
-                                          left_mbmi : above_mbmi;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
 
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)  // single pred (1/3)
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
       else  // comp pred (1/3)
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx]
                                     != cm->comp_var_ref[1]);
     } else {  // inter/inter
-      int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME;
-      int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME;
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
       MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0]
                                      : above_mbmi->ref_frame[var_ref_idx];
       MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0]
@@ -189,13 +188,15 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+    if (!is_inter_block(edge_mbmi)) {
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] > INTRA_FRAME)
-      pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
-                              != cm->comp_var_ref[1]);
-    else
-      pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx]
+                                != cm->comp_var_ref[1]);
+      else
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]);
+    }
   } else {  // no edges available (2)
     pred_context = 2;
   }
@@ -205,91 +206,91 @@
 }
 
 unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
-        left_mbmi->ref_frame[0] == INTRA_FRAME) {
+    if (above_intra && left_intra) {  // intra/intra
       pred_context = 2;
-    } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
-               left_mbmi->ref_frame[0] == INTRA_FRAME) {
-      const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
-                                          left_mbmi : above_mbmi;
-
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi))
         pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
       else
         pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
                             edge_mbmi->ref_frame[1] == LAST_FRAME);
-    } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-               left_mbmi->ref_frame[1] <= INTRA_FRAME) {
-      pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
-                     2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
-    } else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
-               left_mbmi->ref_frame[1] > INTRA_FRAME) {
-      pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                          above_mbmi->ref_frame[1] == LAST_FRAME ||
-                          left_mbmi->ref_frame[0] == LAST_FRAME ||
-                          left_mbmi->ref_frame[1] == LAST_FRAME);
-    } else {
-      MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
-
-      if (rfs == LAST_FRAME)
-        pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-      else
-        pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
+    } else {  // inter/inter
+      if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
+        pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) +
+                       2 * (left_mbmi->ref_frame[0] == LAST_FRAME);
+      } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
+        pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME ||
+                            above_mbmi->ref_frame[1] == LAST_FRAME ||
+                            left_mbmi->ref_frame[0] == LAST_FRAME ||
+                            left_mbmi->ref_frame[1] == LAST_FRAME);
+      } else {
+        const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+
+        if (rfs == LAST_FRAME)
+          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else
+          pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
+      }
     }
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
-
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME)
+    if (!is_inter_block(edge_mbmi)) {  // intra
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
-      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-    else
-      pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
-                          edge_mbmi->ref_frame[1] == LAST_FRAME);
-  } else {  // no edges available (2)
+    } else {  // inter
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      else
+        pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                            edge_mbmi->ref_frame[1] == LAST_FRAME);
+    }
+  } else {  // no edges available
     pred_context = 2;
   }
+
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
   return pred_context;
 }
 
 unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
   int pred_context;
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+  const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
   // Note:
   // The mode info data structure has a one element border above and to the
   // left of the entries corresponding to real macroblocks.
   // The prediction flags in these dummy entries are initialised to 0.
   if (above_in_image && left_in_image) {  // both edges available
-    if (above_mbmi->ref_frame[0] == INTRA_FRAME &&
-        left_mbmi->ref_frame[0] == INTRA_FRAME) {
+    if (above_intra && left_intra) {  // intra/intra
       pred_context = 2;
-    } else if (above_mbmi->ref_frame[0] == INTRA_FRAME ||
-               left_mbmi->ref_frame[0] == INTRA_FRAME) {
-      const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ?
-                                          left_mbmi : above_mbmi;
-
-      if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) {
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
         if (edge_mbmi->ref_frame[0] == LAST_FRAME)
           pred_context = 3;
         else
@@ -298,54 +299,53 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
         pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
                                 edge_mbmi->ref_frame[1] == GOLDEN_FRAME);
       }
-    } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME &&
-               left_mbmi->ref_frame[1] <= INTRA_FRAME) {
-      if (above_mbmi->ref_frame[0] == LAST_FRAME &&
-          left_mbmi->ref_frame[0] == LAST_FRAME) {
-        pred_context = 3;
-      } else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
-                 left_mbmi->ref_frame[0] == LAST_FRAME) {
-        const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ?
-                                            left_mbmi : above_mbmi;
+    } else {  // inter/inter
+      if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) {
+        if (above_mbmi->ref_frame[0] == LAST_FRAME &&
+            left_mbmi->ref_frame[0] == LAST_FRAME) {
+          pred_context = 3;
+        } else if (above_mbmi->ref_frame[0] == LAST_FRAME ||
+                   left_mbmi->ref_frame[0] == LAST_FRAME) {
+          const MB_MODE_INFO *edge_mbmi =
+              above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi;
 
-        pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        } else {
+          pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
+                         2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        }
+      } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) {
+        if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
+            above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
+          pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                              above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
+                              left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                              left_mbmi->ref_frame[1] == GOLDEN_FRAME);
+        else
+          pred_context = 2;
       } else {
-        pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) +
-                       2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME);
+        const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
+        const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ?
+            above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
+
+        if (rfs == GOLDEN_FRAME)
+          pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
+        else if (rfs == ALTREF_FRAME)
+          pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
+        else
+          pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
       }
-    } else if (above_mbmi->ref_frame[1] > INTRA_FRAME &&
-               left_mbmi->ref_frame[1] > INTRA_FRAME) {
-      if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] &&
-          above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1])
-        pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                            above_mbmi->ref_frame[1] == GOLDEN_FRAME ||
-                            left_mbmi->ref_frame[0] == GOLDEN_FRAME ||
-                            left_mbmi->ref_frame[1] == GOLDEN_FRAME);
-      else
-        pred_context = 2;
-    } else {
-      MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0];
-      MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ?
-          above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1];
-
-      if (rfs == GOLDEN_FRAME)
-        pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
-      else if (rfs == ALTREF_FRAME)
-        pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
-      else
-        pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
     }
   } else if (above_in_image || left_in_image) {  // one edge available
     const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
 
-    if (edge_mbmi->ref_frame[0] == INTRA_FRAME ||
-        (edge_mbmi->ref_frame[0] == LAST_FRAME &&
-         edge_mbmi->ref_frame[1] <= INTRA_FRAME))
+    if (!is_inter_block(edge_mbmi) ||
+        (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
       pred_context = 2;
-    else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME)
+    else if (!has_second_ref(edge_mbmi))
       pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME);
     else
       pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME ||
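Every function in this file was rewritten to use the has_second_ref()/is_inter_block() predicates instead of raw ref_frame[] comparisons against INTRA_FRAME. A minimal, self-contained mirror of those predicates, assuming VP9's reference-frame numbering (NONE = -1, INTRA_FRAME = 0, LAST_FRAME = 1, ...):

#include <stdio.h>

enum { NONE = -1, INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME };
typedef struct { int ref_frame[2]; } mbmi_t;

static int has_second_ref(const mbmi_t *m) {
  return m->ref_frame[1] > INTRA_FRAME;  /* compound prediction */
}
static int is_inter_block(const mbmi_t *m) {
  return m->ref_frame[0] > INTRA_FRAME;
}

int main(void) {
  mbmi_t intra  = {{ INTRA_FRAME, NONE }};
  mbmi_t single = {{ LAST_FRAME, NONE }};
  mbmi_t comp   = {{ LAST_FRAME, ALTREF_FRAME }};
  printf("has_second_ref: %d %d %d\n",
         has_second_ref(&intra), has_second_ref(&single),
         has_second_ref(&comp));                     /* 0 0 1 */
  printf("is_inter_block: %d %d %d\n",
         is_inter_block(&intra), is_inter_block(&single),
         is_inter_block(&comp));                     /* 0 1 1 */
  return 0;
}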
@@ -361,22 +361,23 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
 // left of the entries corresponding to real blocks.
 // The prediction flags in these dummy entries are initialized to 0.
 unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
-  const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
-  const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
-  const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type];
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const MB_MODE_INFO *const above_mbmi = above_mi ? &above_mi->mbmi : 0;
+  const MB_MODE_INFO *const left_mbmi = left_mi ? &left_mi->mbmi : 0;
+  const int left_in_image = xd->left_available && left_mi;
+  const int above_in_image = xd->up_available && above_mi;
+  const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
   int above_context = max_tx_size;
   int left_context = max_tx_size;
 
   if (above_in_image)
-    above_context = above_mbmi->mb_skip_coeff ? max_tx_size
-                                              : above_mbmi->txfm_size;
+    above_context = above_mbmi->skip_coeff ? max_tx_size
+                                           : above_mbmi->tx_size;
 
   if (left_in_image)
-    left_context = left_mbmi->mb_skip_coeff ? max_tx_size
-                                            : left_mbmi->txfm_size;
+    left_context = left_mbmi->skip_coeff ? max_tx_size
+                                         : left_mbmi->tx_size;
 
   if (!left_in_image)
     left_context = above_context;
@@ -387,36 +388,17 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
   return above_context + left_context > max_tx_size;
 }
 
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag) {
-  MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
-  const int bw = 1 << mi_width_log2(bsize);
-  const int bh = 1 << mi_height_log2(bsize);
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int x, y;
-
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
-      mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag;
+void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
+  xd->this_mi->mbmi.seg_id_predicted = pred_flag;
 }
 
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag) {
-  MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col];
-  const int bw = 1 << mi_width_log2(bsize);
-  const int bh = 1 << mi_height_log2(bsize);
-  const int xmis = MIN(cm->mi_cols - mi_col, bw);
-  const int ymis = MIN(cm->mi_rows - mi_row, bh);
-  int x, y;
-
-  for (y = 0; y < ymis; y++)
-    for (x = 0; x < xmis; x++)
-      mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag;
+void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                              uint8_t pred_flag) {
+  xd->this_mi->mbmi.skip_coeff = pred_flag;
 }
 
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) {
+                       BLOCK_SIZE bsize, int mi_row, int mi_col) {
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
   const int bw = 1 << mi_width_log2(bsize);
   const int bh = 1 << mi_height_log2(bsize);
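The transform-size context above sums the two neighbours' transform sizes, substituting max_tx_size for skipped or unavailable neighbours, and signals whether the sum exceeds max_tx_size. A runnable mirror with transform sizes as small integers (3 standing for a 32x32 transform):

#include <stdio.h>

static int tx_size_context(int max_tx_size,
                           int have_above, int above_tx, int above_skip,
                           int have_left, int left_tx, int left_skip) {
  /* skipped or missing neighbours count as max_tx_size */
  int above_ctx = (have_above && !above_skip) ? above_tx : max_tx_size;
  int left_ctx = (have_left && !left_skip) ? left_tx : max_tx_size;
  if (!have_left)
    left_ctx = above_ctx;
  if (!have_above)
    above_ctx = left_ctx;
  return above_ctx + left_ctx > max_tx_size;
}

int main(void) {
  /* 32x32 block: small neighbour transforms keep the context at 0 */
  printf("%d\n", tx_size_context(3, 1, 1, 0, 1, 1, 0));  /* 0 */
  printf("%d\n", tx_size_context(3, 1, 3, 0, 1, 1, 0));  /* 1 */
  printf("%d\n", tx_size_context(3, 0, 0, 0, 0, 0, 0));  /* 1 */
  return 0;
}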
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index 238290b..47ca8ab 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -15,32 +15,32 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
-                       BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col);
+                       BLOCK_SIZE bsize, int mi_row, int mi_col);
 
 static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const int above_sip = above_mi ? above_mi->mbmi.seg_id_predicted : 0;
+  const int left_sip = left_mi ? left_mi->mbmi.seg_id_predicted : 0;
 
-  return above_mbmi->seg_id_predicted +
-         (xd->left_available ? left_mbmi->seg_id_predicted : 0);
+  return above_sip + (xd->left_available ? left_sip : 0);
 }
 
-static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) {
-  return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)];
+static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
+                                                const MACROBLOCKD *xd) {
+  return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
 }
 
-void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag);
+void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
 
 static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
-  const MODE_INFO *const mi = xd->mode_info_context;
-  const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
-  const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
+  const MODE_INFO * const above_mi = xd->mi_8x8[-xd->mode_info_stride];
+  const MODE_INFO * const left_mi = xd->mi_8x8[-1];
+  const int above_skip_coeff = above_mi ? above_mi->mbmi.skip_coeff : 0;
+  const int left_skip_coeff = left_mi ? left_mi->mbmi.skip_coeff : 0;
 
-  return above_mbmi->mb_skip_coeff +
-         (xd->left_available ? left_mbmi->mb_skip_coeff : 0);
+  return above_skip_coeff + (xd->left_available ? left_skip_coeff : 0);
 }
 
 static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
@@ -49,20 +49,14 @@ static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
 }
 
 static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
-  return xd->mode_info_context->mbmi.mb_skip_coeff;
+  return xd->this_mi->mbmi.skip_coeff;
 }
 
-void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize,
-                              int mi_row, int mi_col, uint8_t pred_flag);
+void vp9_set_pred_flag_mbskip(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                              uint8_t pred_flag);
 
 unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
 
-static INLINE const vp9_prob *vp9_get_pred_probs_switchable_interp(
-    const VP9_COMMON *cm, const MACROBLOCKD *xd) {
-  const int pred_context = vp9_get_pred_context_switchable_interp(xd);
-  return &cm->fc.switchable_interp_prob[pred_context][0];
-}
-
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
 
 static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm,
@@ -108,7 +102,7 @@ static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
 
 unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
 
-static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
                                     const struct tx_probs *tx_probs) {
   if (bsize < BLOCK_16X16)
     return tx_probs->p8x8[context];
@@ -119,13 +113,14 @@ static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
 }
 
 static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
-                                     const struct tx_probs *tx_probs) {
-  const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+                                     const struct tx_probs *tx_probs,
+                                     const MODE_INFO *m) {
+  const BLOCK_SIZE bsize = m->mbmi.sb_type;
   const int context = vp9_get_pred_context_tx_size(xd);
   return get_tx_probs(bsize, context, tx_probs);
 }
 
-static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
+static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
                              TX_SIZE tx_size, struct tx_counts *tx_counts) {
   if (bsize >= BLOCK_32X32)
     tx_counts->p32x32[context][tx_size]++;
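get_tx_probs() above bands its probability tables by block size, so a sub-16x16 block can never signal a 16x16 or 32x32 transform. A sketch of the banding; the BLOCK_* values mirror VP9's enum but are stand-ins here:

#include <stdio.h>

enum { BLOCK_8X8 = 3, BLOCK_16X16 = 6, BLOCK_32X32 = 9, BLOCK_64X64 = 12 };

static const char *tx_prob_band(int bsize) {
  if (bsize < BLOCK_16X16) return "p8x8";    /* up to 8x8 transforms */
  if (bsize < BLOCK_32X32) return "p16x16";  /* up to 16x16 */
  return "p32x32";                           /* full range */
}

int main(void) {
  printf("%s %s %s %s\n", tx_prob_band(BLOCK_8X8), tx_prob_band(BLOCK_16X16),
         tx_prob_band(BLOCK_32X32), tx_prob_band(BLOCK_64X64));
  return 0;
}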
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
index 48d86c5..bc40854 100644
--- a/libvpx/vp9/common/vp9_quant_common.c
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -130,12 +130,12 @@ int16_t vp9_ac_quant(int qindex, int delta) {
 }
 
-int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) {
-  if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) {
-    const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q);
-    return xd->seg.abs_delta == SEGMENT_ABSDATA ?
-               data :  // Abs value
-               clamp(base_qindex + data, 0, MAXQ);  // Delta value
+int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) {
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+    return seg->abs_delta == SEGMENT_ABSDATA ?
+               data :  // Abs value
+               clamp(base_qindex + data, 0, MAXQ);  // Delta value
   } else {
     return base_qindex;
   }
diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h
index ded9426..83f2fb6 100644
--- a/libvpx/vp9/common/vp9_quant_common.h
+++ b/libvpx/vp9/common/vp9_quant_common.h
@@ -23,6 +23,6 @@ void vp9_init_quant_tables();
 int16_t vp9_dc_quant(int qindex, int delta);
 int16_t vp9_ac_quant(int qindex, int delta);
 
-int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex);
+int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex);
 
 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
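vp9_get_qindex() now takes the segmentation struct directly, but its ALT_Q semantics are unchanged: a segment either overrides the base quantizer index outright (SEGMENT_ABSDATA) or applies a clamped delta. A self-contained mirror, assuming MAXQ = 255 as in VP9:

#include <stdio.h>

#define MAXQ 255

static int clampi(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

/* active: the segment's ALT_Q feature is enabled;
   abs_delta: SEGMENT_ABSDATA semantics; data: the segment value. */
static int get_qindex(int active, int abs_delta, int data, int base) {
  if (!active)
    return base;
  return abs_delta ? data : clampi(base + data, 0, MAXQ);
}

int main(void) {
  printf("%d\n", get_qindex(0, 0, 0, 100));    /* 100: feature off */
  printf("%d\n", get_qindex(1, 1, 40, 100));   /* 40: absolute override */
  printf("%d\n", get_qindex(1, 0, -30, 100));  /* 70: delta */
  printf("%d\n", get_qindex(1, 0, 200, 100));  /* 255: clamped */
  return 0;
}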
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index 0b65e06..dc1d46c 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -10,171 +10,27 @@
 
 #include <assert.h>
 
+#include "./vpx_scale_rtcd.h"
 #include "./vpx_config.h"
+
 #include "vpx/vpx_integer.h"
+
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "./vpx_scale_rtcd.h"
-
-static int scale_value_x_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
-}
-
-static int scale_value_y_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
-}
-
-static int unscaled_value(int val, const struct scale_factors *scale) {
-  (void) scale;
-  return val;
-}
-
-static MV32 mv_q3_to_q4_with_scaling(const MV *mv,
-                                     const struct scale_factors *scale) {
-  const MV32 res = {
-    ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT)
-        + scale->y_offset_q4,
-    ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT)
-        + scale->x_offset_q4
-  };
-  return res;
-}
-
-static MV32 mv_q3_to_q4_without_scaling(const MV *mv,
-                                        const struct scale_factors *scale) {
-  const MV32 res = {
-    mv->row << 1,
-    mv->col << 1
-  };
-  return res;
-}
-
-static MV32 mv_q4_with_scaling(const MV *mv,
-                               const struct scale_factors *scale) {
-  const MV32 res = {
-    (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4,
-    (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4
-  };
-  return res;
-}
-
-static MV32 mv_q4_without_scaling(const MV *mv,
-                                  const struct scale_factors *scale) {
-  const MV32 res = {
-    mv->row,
-    mv->col
-  };
-  return res;
-}
-
-static void set_offsets_with_scaling(struct scale_factors *scale,
-                                     int row, int col) {
-  const int x_q4 = 16 * col;
-  const int y_q4 = 16 * row;
-
-  scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
-  scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf;
-}
-
-static void set_offsets_without_scaling(struct scale_factors *scale,
-                                        int row, int col) {
-  scale->x_offset_q4 = 0;
-  scale->y_offset_q4 = 0;
-}
-
-static int get_fixed_point_scale_factor(int other_size, int this_size) {
-  // Calculate scaling factor once for each reference frame
-  // and use fixed point scaling factors in decoding and encoding routines.
-  // Hardware implementations can calculate scale factor in device driver
-  // and use multiplication and shifting on hardware instead of division.
-  return (other_size << VP9_REF_SCALE_SHIFT) / this_size;
-}
-
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h) {
-  scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
-  scale->x_offset_q4 = 0;  // calculated per-mb
-  scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT);
-
-  scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
-  scale->y_offset_q4 = 0;  // calculated per-mb
-  scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT);
-
-  if ((other_w == this_w) && (other_h == this_h)) {
-    scale->scale_value_x = unscaled_value;
-    scale->scale_value_y = unscaled_value;
-    scale->set_scaled_offsets = set_offsets_without_scaling;
-    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling;
-    scale->scale_mv_q4 = mv_q4_without_scaling;
-  } else {
-    scale->scale_value_x = scale_value_x_with_scaling;
-    scale->scale_value_y = scale_value_y_with_scaling;
-    scale->set_scaled_offsets = set_offsets_with_scaling;
-    scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling;
-    scale->scale_mv_q4 = mv_q4_with_scaling;
-  }
-
-  // TODO(agrange): Investigate the best choice of functions to use here
-  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
-  // to do at full-pel offsets. The current selection, where the filter is
-  // applied in one direction only, and not at all for 0,0, seems to give the
-  // best quality, but it may be worth trying an additional mode that does
-  // do the filtering on full-pel.
-  if (scale->x_step_q4 == 16) {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in either direction.
-      scale->predict[0][0][0] = vp9_convolve_copy;
-      scale->predict[0][0][1] = vp9_convolve_avg;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
-    } else {
-      // No scaling in x direction. Must always scale in the y direction.
-      scale->predict[0][0][0] = vp9_convolve8_vert;
-      scale->predict[0][0][1] = vp9_convolve8_avg_vert;
-      scale->predict[0][1][0] = vp9_convolve8_vert;
-      scale->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_avg;
-    }
-  } else {
-    if (scale->y_step_q4 == 16) {
-      // No scaling in the y direction. Must always scale in the x direction.
-      scale->predict[0][0][0] = vp9_convolve8_horiz;
-      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_avg;
-      scale->predict[1][0][0] = vp9_convolve8_horiz;
-      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
-    } else {
-      // Must always scale in both directions.
-      scale->predict[0][0][0] = vp9_convolve8;
-      scale->predict[0][0][1] = vp9_convolve8_avg;
-      scale->predict[0][1][0] = vp9_convolve8;
-      scale->predict[0][1][1] = vp9_convolve8_avg;
-      scale->predict[1][0][0] = vp9_convolve8;
-      scale->predict[1][0][1] = vp9_convolve8_avg;
-    }
-  }
-  // 2D subpel motion always gets filtered in both directions
-  scale->predict[1][1][0] = vp9_convolve8;
-  scale->predict[1][1][1] = vp9_convolve8_avg;
-}
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
-  if (xd->mode_info_context) {
-    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
+  if (xd->mi_8x8 && xd->this_mi) {
+    MB_MODE_INFO * mbmi = &xd->this_mi->mbmi;
 
     set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1,
                       cm->active_ref_scale);
+  } else {
+    set_scale_factors(xd, -1, -1, cm->active_ref_scale);
   }
 
   switch (mcomp_filter_type) {
@@ -199,17 +55,18 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *src_mv,
                                const struct scale_factors *scale,
-                               int w, int h, int weight,
+                               int w, int h, int ref,
                                const struct subpix_fn_table *subpix,
                                enum mv_precision precision) {
-  const MV32 mv = precision == MV_PRECISION_Q4
-      ? scale->scale_mv_q4(src_mv, scale)
-      : scale->scale_mv_q3_to_q4(src_mv, scale);
-  const int subpel_x = mv.col & 15;
-  const int subpel_y = mv.row & 15;
-
-  src += (mv.row >> 4) * src_stride + (mv.col >> 4);
-  scale->predict[!!subpel_x][!!subpel_y][weight](
+  const int is_q4 = precision == MV_PRECISION_Q4;
+  const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row << 1,
+                     is_q4 ? src_mv->col : src_mv->col << 1 };
+  const MV32 mv = scale->scale_mv(&mv_q4, scale);
+  const int subpel_x = mv.col & SUBPEL_MASK;
+  const int subpel_y = mv.row & SUBPEL_MASK;
+
+  src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+  scale->predict[subpel_x != 0][subpel_y != 0][ref](
       src, src_stride, dst, dst_stride,
       subpix->filter_x[subpel_x], scale->x_step_q4,
      subpix->filter_y[subpel_y], scale->y_step_q4,
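The rewritten vp9_build_inter_predictor() above splits a q4 (1/16-pel) motion vector into a full-pel buffer offset and a subpel phase that selects the interpolation filter, via SUBPEL_BITS/SUBPEL_MASK rather than the literals 4 and 15. A standalone illustration; note how the arithmetic right shift (on typical two's-complement targets) keeps negative MVs consistent with the masked phase:

#include <stdio.h>

#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  const int mv_q4[] = { 0, 7, 16, -9 };  /* MV components in 1/16 pel */
  int i;
  for (i = 0; i < 4; ++i) {
    int mv = mv_q4[i];
    /* -9/16 decomposes as -1 full pel + 7/16 phase */
    printf("mv %3d/16 -> full-pel %2d, subpel phase %2d\n",
           mv, mv >> SUBPEL_BITS, mv & SUBPEL_MASK);
  }
  return 0;
}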
@@ -232,20 +89,16 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
   return res;
 }
 
-
 // TODO(jkoleszar): yet another mv clamping function :-(
-MV clamp_mv_to_umv_border_sb(const MV *src_mv,
-                             int bwl, int bhl, int ss_x, int ss_y,
-                             int mb_to_left_edge, int mb_to_top_edge,
-                             int mb_to_right_edge, int mb_to_bottom_edge) {
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+                             int bw, int bh, int ss_x, int ss_y) {
   // If the MV points so far into the UMV border that no visible pixels
   // are used for reconstruction, the subpel part of the MV can be
   // discarded and the MV limited to 16 pixels with equivalent results.
-  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
-  const int spel_right = spel_left - (1 << 4);
-  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
-  const int spel_bottom = spel_top - (1 << 4);
+  const int spel_left = (VP9_INTERP_EXTEND + bw) << SUBPEL_BITS;
+  const int spel_right = spel_left - SUBPEL_SHIFTS;
+  const int spel_top = (VP9_INTERP_EXTEND + bh) << SUBPEL_BITS;
+  const int spel_bottom = spel_top - SUBPEL_SHIFTS;
   MV clamped_mv = {
     src_mv->row << (1 - ss_y),
     src_mv->col << (1 - ss_x)
@@ -253,130 +106,143 @@ MV clamp_mv_to_umv_border_sb(const MV *src_mv,
   assert(ss_x <= 1);
   assert(ss_y <= 1);
 
-  clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
-           (mb_to_right_edge << (1 - ss_x)) + spel_right,
-           (mb_to_top_edge << (1 - ss_y)) - spel_top,
-           (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+  clamp_mv(&clamped_mv, (xd->mb_to_left_edge << (1 - ss_x)) - spel_left,
+           (xd->mb_to_right_edge << (1 - ss_x)) + spel_right,
+           (xd->mb_to_top_edge << (1 - ss_y)) - spel_top,
+           (xd->mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
 
   return clamped_mv;
 }
 
 struct build_inter_predictors_args {
   MACROBLOCKD *xd;
-  int x;
-  int y;
-  uint8_t* dst[MAX_MB_PLANE];
-  int dst_stride[MAX_MB_PLANE];
-  uint8_t* pre[2][MAX_MB_PLANE];
-  int pre_stride[2][MAX_MB_PLANE];
+  int x, y;
 };
-static void build_inter_predictors(int plane, int block,
-                                   BLOCK_SIZE_TYPE bsize,
+
+static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
                                    int pred_w, int pred_h,
                                    void *argv) {
   const struct build_inter_predictors_args* const arg = argv;
-  MACROBLOCKD * const xd = arg->xd;
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
-  const MODE_INFO *const mi = xd->mode_info_context;
+  MACROBLOCKD *const xd = arg->xd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+  const int bw = 4 << bwl;
+  const int bh = plane_block_height(bsize, pd);
+  const int x = 4 * (block & ((1 << bwl) - 1));
+  const int y = 4 * (block >> bwl);
+  const MODE_INFO *mi = xd->this_mi;
   const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
-  int which_mv;
+  int ref;
+
+  assert(x < bw);
+  assert(y < bh);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
+  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
 
-  assert(x < (4 << bwl));
-  assert(y < (4 << bhl));
-  assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_w == (4 << bwl));
-  assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_h == (4 << bhl));
+  for (ref = 0; ref < 1 + use_second_ref; ++ref) {
+    struct scale_factors *const scale = &xd->scale_factor[ref];
+    struct buf_2d *const pre_buf = &pd->pre[ref];
+    struct buf_2d *const dst_buf = &pd->dst;
 
-  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-    // source
-    const uint8_t * const base_pre = arg->pre[which_mv][plane];
-    const int pre_stride = arg->pre_stride[which_mv][plane];
-    const uint8_t *const pre = base_pre +
-        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
-    struct scale_factors * const scale = &xd->scale_factor[which_mv];
+    const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
+                                   pre_buf->stride, scale);
 
-    // dest
-    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
+    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
 
     // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
taken as the // same MV (the average of the 4 luma MVs) but we could do something // smarter for non-4:2:0. Just punt for now, pending the changes to get // rid of SPLITMV mode entirely. - const MV mv = mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 - ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv - : mi_mv_pred_q4(mi, which_mv)) - : mi->mbmi.mv[which_mv].as_mv; + const MV mv = mi->mbmi.sb_type < BLOCK_8X8 + ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv + : mi_mv_pred_q4(mi, ref)) + : mi->mbmi.mv[ref].as_mv; // TODO(jkoleszar): This clamping is done in the incorrect place for the // scaling case. It needs to be done on the scaled MV, not the pre-scaling // MV. Note however that it performs the subsampling aware scaling so // that the result is always q4. - const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y, - xd->mb_to_left_edge, - xd->mb_to_top_edge, - xd->mb_to_right_edge, - xd->mb_to_bottom_edge); + const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_stride, - dst, arg->dst_stride[plane], - &res_mv, &xd->scale_factor[which_mv], - 4 << pred_w, 4 << pred_h, which_mv, + vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, + &res_mv, scale, + 4 << pred_w, 4 << pred_h, ref, &xd->subpix, MV_PRECISION_Q4); } } -void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, - int mi_row, - int mi_col, - BLOCK_SIZE_TYPE bsize) { - struct build_inter_predictors_args args = { - xd, mi_col * MI_SIZE, mi_row * MI_SIZE, - {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0}, - {{xd->plane[0].pre[0].buf, NULL, NULL}, - {xd->plane[0].pre[1].buf, NULL, NULL}}, - {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}}, - }; - foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args); +// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could +// calculate the subsampled BLOCK_SIZE, but that type isn't defined for +// sizes smaller than 16x16 yet. +typedef void (*foreach_predicted_block_visitor)(int plane, int block, + BLOCK_SIZE bsize, + int pred_w, int pred_h, + void *arg); +static INLINE void foreach_predicted_block_in_plane( + const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane, + foreach_predicted_block_visitor visit, void *arg) { + int i, x, y; + + // block sizes in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // subsampled size of the block + const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + + // size of the predictor to use. 
+  int pred_w, pred_h;
+
+  if (xd->this_mi->mbmi.sb_type < BLOCK_8X8) {
+    assert(bsize == BLOCK_8X8);
+    pred_w = 0;
+    pred_h = 0;
+  } else {
+    pred_w = bwl;
+    pred_h = bhl;
+  }
+  assert(pred_w <= bwl);
+  assert(pred_h <= bhl);
+
+  // visit each subblock in raster order
+  i = 0;
+  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
+    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
+      visit(plane, i, bsize, pred_w, pred_h, arg);
+      i += 1 << pred_w;
+    }
+    i += (1 << (bwl + pred_h)) - (1 << bwl);
+  }
 }
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     int mi_row,
-                                     int mi_col,
-                                     BLOCK_SIZE_TYPE bsize) {
-  struct build_inter_predictors_args args = {
-    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
-#if CONFIG_ALPHA
-    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
-     xd->plane[3].dst.buf},
-    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
-     xd->plane[3].dst.stride},
-    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
-      xd->plane[3].pre[0].buf},
-     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
-      xd->plane[3].pre[1].buf}},
-    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
-      xd->plane[3].pre[0].stride},
-     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
-      xd->plane[3].pre[1].stride}},
-#else
-    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
-    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
-    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
-     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
-    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
-     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
-#endif
-  };
-  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
+
+static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col,
+                                              int plane_from, int plane_to) {
+  int plane;
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    struct build_inter_predictors_args args = {
+      xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
+    };
+    foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
+                                     &args);
+  }
 }
 
-void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
-                                   int mi_row, int mi_col,
-                                   BLOCK_SIZE_TYPE bsize) {
-  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
-  vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+}
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
+                                    MAX_MB_PLANE - 1);
+}
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
+                                    MAX_MB_PLANE - 1);
 }
 
 // TODO(dkovalev: find better place for this function)
@@ -391,8 +257,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
                                      fb->y_crop_width, fb->y_crop_height,
                                      cm->width, cm->height);
 
-    if (sf->x_scale_fp != VP9_REF_NO_SCALE ||
-        sf->y_scale_fp != VP9_REF_NO_SCALE)
+    if (vp9_is_scaled(sf))
       vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
   }
 }
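The new foreach_predicted_block_in_plane() above visits a block's subblocks in raster order while maintaining the running 4x4-unit block index i. A self-contained mirror for a 16x16 block split into 8x8 predictors (all sizes expressed in 4x4 units, log2):

#include <stdio.h>

int main(void) {
  const int bwl = 2, bhl = 2;        /* 16x16 block: a 4x4 grid of 4x4s */
  const int pred_w = 1, pred_h = 1;  /* 8x8 predictors */
  int i = 0, x, y;
  for (y = 0; y < 1 << bhl; y += 1 << pred_h) {
    for (x = 0; x < 1 << bwl; x += 1 << pred_w) {
      /* prints block indices 0, 2, 8, 10 - the raster numbering of
         the four 8x8 subblocks in 4x4 units */
      printf("visit block %2d at (%d,%d)\n", i, x, y);
      i += 1 << pred_w;
    }
    i += (1 << (bwl + pred_h)) - (1 << bwl);  /* skip the covered rows */
  }
  return 0;
}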
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index 6ec7323..504b793 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -15,28 +15,19 @@
 #include "vp9/common/vp9_onyxc_int.h"
 
 struct subpix_fn_table;
-void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
-                                    int mb_row,
-                                    int mb_col,
-                                    BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize);
 
-void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
-                                     int mb_row,
-                                     int mb_col,
-                                     BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize);
 
-void vp9_build_inter_predictors_sb(MACROBLOCKD *mb,
-                                   int mb_row, int mb_col,
-                                   BLOCK_SIZE_TYPE bsize);
+void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
 
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE filter,
                               VP9_COMMON *cm);
 
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h);
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c
index f351224..4a451b9 100644
--- a/libvpx/vp9/common/vp9_reconintra.c
+++ b/libvpx/vp9/common/vp9_reconintra.c
@@ -8,14 +8,15 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-
 #include "./vpx_config.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
+
 #include "vp9_rtcd.h"
+
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
 
 const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
     DCT_DCT,    // DC
@@ -25,7 +26,7 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
     ADST_ADST,  // D135
     ADST_DCT,   // D117
     DCT_ADST,   // D153
-    DCT_ADST,   // D27
+    DCT_ADST,   // D207
     ADST_DCT,   // D63
     ADST_ADST,  // TM
     DCT_DCT,    // NEARESTMV
@@ -35,294 +36,256 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
 };
 
 #define intra_pred_sized(type, size) \
-void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \
-                                                ptrdiff_t stride, \
-                                                uint8_t *above_row, \
-                                                uint8_t *left_col) { \
-  type##_predictor(pred_ptr, stride, size, above_row, left_col); \
-}
+  void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
+                                                  ptrdiff_t stride, \
+                                                  const uint8_t *above, \
+                                                  const uint8_t *left) { \
+    type##_predictor(dst, stride, size, above, left); \
+  }
+
 #define intra_pred_allsizes(type) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
  intra_pred_sized(type, 32)
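The directional predictors below all build on ROUND_POWER_OF_TWO() for their 2- and 3-tap averages; the macro, defined elsewhere in libvpx and reproduced here for reference, rounds to nearest:

#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  int a = 100, b = 103, c = 105;
  printf("%d\n", ROUND_POWER_OF_TWO(a + b, 1));          /* 2-tap: 102 */
  printf("%d\n", ROUND_POWER_OF_TWO(a + b * 2 + c, 2));  /* 3-tap: 103 */
  return 0;
}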
- 2] + + left[bs - 1] * 3, 2); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; // rest of last row - for (c = 0; c < bs - 2; ++c) { - pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1]; - } + for (c = 0; c < bs - 2; ++c) + dst[(bs - 1) * stride + c] = left[bs - 1]; - for (r = bs - 2; r >= 0; --r) { - for (c = 0; c < bs - 2; ++c) { - pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2]; - } - } + for (r = bs - 2; r >= 0; --r) + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; } -intra_pred_allsizes(d27) +intra_pred_allsizes(d207) -static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - if (r & 1) { - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + - above_row[r/2 + c + 1] * 2 + - above_row[r/2 + c + 2], 2); - } else { - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + - above_row[r/2+ c + 1], 1); - } - } - pred_ptr += stride; + for (c = 0; c < bs; ++c) + dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1] * 2 + + above[r/2 + c + 2], 2) + : ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1], 1); + dst += stride; } } intra_pred_allsizes(d63) -static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - if (r + c + 2 < bs * 2) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] + - above_row[r + c + 1] * 2 + - above_row[r + c + 2], 2); - else - pred_ptr[c] = above_row[bs * 2 - 1]; - } - pred_ptr += stride; + for (c = 0; c < bs; ++c) + dst[c] = r + c + 2 < bs * 2 ? 
ROUND_POWER_OF_TWO(above[r + c] + + above[r + c + 1] * 2 + + above[r + c + 2], 2) + : above[bs * 2 - 1]; + dst += stride; } } intra_pred_allsizes(d45) -static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; + // first row for (c = 0; c < bs; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1); - pred_ptr += stride; + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1); + dst += stride; // second row - pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + - above_row[-1] * 2 + - above_row[0], 2); + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); for (c = 1; c < bs; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + - above_row[c - 1] * 2 + - above_row[c], 2); - pred_ptr += stride; + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst += stride; // the rest of first col - pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + - left_col[0] * 2 + - left_col[1], 2); + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); for (r = 3; r < bs; ++r) - pred_ptr[(r-2) * stride] = ROUND_POWER_OF_TWO(left_col[r - 3] + - left_col[r - 2] * 2 + - left_col[r - 1], 2); + dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 + + left[r - 1], 2); + // the rest of the block for (r = 2; r < bs; ++r) { for (c = 1; c < bs; c++) - pred_ptr[c] = pred_ptr[-2 * stride + c - 1]; - pred_ptr += stride; + dst[c] = dst[-2 * stride + c - 1]; + dst += stride; } } intra_pred_allsizes(d117) -static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; - pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + - above_row[-1] * 2 + - above_row[0], 2); + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); for (c = 1; c < bs; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + - above_row[c - 1] * 2 + - above_row[c], 2); + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); - pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] + - left_col[0] * 2 + - left_col[1], 2); + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); for (r = 2; r < bs; ++r) - pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + - left_col[r - 1] * 2 + - left_col[r], 2); + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); - pred_ptr += stride; + dst += stride; for (r = 1; r < bs; ++r) { for (c = 1; c < bs; c++) - pred_ptr[c] = pred_ptr[-stride + c - 1]; - pred_ptr += stride; + dst[c] = dst[-stride + c - 1]; + dst += stride; } } intra_pred_allsizes(d135) -static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; - pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1); + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1); for (r = 1; r < bs; r++) - pred_ptr[r * stride] = - ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1); - pred_ptr++; - - pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + - above_row[-1] * 2 + - above_row[0], 2); - pred_ptr[stride] = 
ROUND_POWER_OF_TWO(above_row[-1] + - left_col[0] * 2 + - left_col[1], 2); + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1); + dst++; + + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); for (r = 2; r < bs; r++) - pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + - left_col[r - 1] * 2 + - left_col[r], 2); - pred_ptr++; + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); + dst++; for (c = 0; c < bs - 2; c++) - pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + - above_row[c] * 2 + - above_row[c + 1], 2); - pred_ptr += stride; + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2); + dst += stride; + for (r = 1; r < bs; ++r) { for (c = 0; c < bs - 2; c++) - pred_ptr[c] = pred_ptr[-stride + c - 2]; - pred_ptr += stride; + dst[c] = dst[-stride + c - 2]; + dst += stride; } } intra_pred_allsizes(d153) -static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r; for (r = 0; r < bs; r++) { - vpx_memcpy(pred_ptr, above_row, bs); - pred_ptr += stride; + vpx_memcpy(dst, above, bs); + dst += stride; } } intra_pred_allsizes(v) -static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, left_col[r], bs); - pred_ptr += stride; + vpx_memset(dst, left[r], bs); + dst += stride; } } intra_pred_allsizes(h) -static INLINE void tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r, c; - int ytop_left = above_row[-1]; + int ytop_left = above[-1]; for (r = 0; r < bs; r++) { for (c = 0; c < bs; c++) - pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left); - pred_ptr += stride; + dst[c] = clip_pixel(left[r] + above[c] - ytop_left); + dst += stride; } } intra_pred_allsizes(tm) -static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { int r; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, 128, bs); - pred_ptr += stride; + vpx_memset(dst, 128, bs); + dst += stride; } } intra_pred_allsizes(dc_128) -static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride, - int bs, - uint8_t *above_row, uint8_t *left_col) { - int i, r; - int expected_dc = 128; - int average = 0; - const int count = bs; +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; for (i = 0; i < bs; i++) - average += left_col[i]; - expected_dc = (average + (count >> 1)) / count; + sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, expected_dc, bs); - pred_ptr += stride; + vpx_memset(dst, expected_dc, bs); + dst += stride; } } intra_pred_allsizes(dc_left) -static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int 
bs, - uint8_t *above_row, uint8_t *left_col) { - int i, r; - int expected_dc = 128; - int average = 0; - const int count = bs; +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; for (i = 0; i < bs; i++) - average += above_row[i]; - expected_dc = (average + (count >> 1)) / count; + sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, expected_dc, bs); - pred_ptr += stride; + vpx_memset(dst, expected_dc, bs); + dst += stride; } } intra_pred_allsizes(dc_top) -static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, - uint8_t *above_row, uint8_t *left_col) { - int i, r; - int expected_dc = 128; - int average = 0; +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; const int count = 2 * bs; - for (i = 0; i < bs; i++) - average += above_row[i]; - for (i = 0; i < bs; i++) - average += left_col[i]; - expected_dc = (average + (count >> 1)) / count; + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; for (r = 0; r < bs; r++) { - vpx_memset(pred_ptr, expected_dc, bs); - pred_ptr += stride; + vpx_memset(dst, expected_dc, bs); + dst += stride; } } intra_pred_allsizes(dc) #undef intra_pred_allsizes -typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride, - uint8_t *above_row, uint8_t *left_col); +typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left); -static intra_pred_fn pred[VP9_INTRA_MODES][4]; +static intra_pred_fn pred[INTRA_MODES][4]; static intra_pred_fn dc_pred[2][2][4]; static void init_intra_pred_fn_ptrs(void) { @@ -334,7 +297,7 @@ static void init_intra_pred_fn_ptrs(void) { intra_pred_allsizes(pred[V_PRED], v); intra_pred_allsizes(pred[H_PRED], h); - intra_pred_allsizes(pred[D27_PRED], d27); + intra_pred_allsizes(pred[D207_PRED], d207); intra_pred_allsizes(pred[D45_PRED], d45); intra_pred_allsizes(pred[D63_PRED], d63); intra_pred_allsizes(pred[D117_PRED], d117); @@ -350,16 +313,17 @@ static void init_intra_pred_fn_ptrs(void) { #undef intra_pred_allsizes } -static void build_intra_predictors(uint8_t *src, int src_stride, - uint8_t *pred_ptr, int stride, - MB_PREDICTION_MODE mode, TX_SIZE txsz, +static void build_intra_predictors(const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, + MB_PREDICTION_MODE mode, TX_SIZE tx_size, int up_available, int left_available, int right_available) { int i; DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64); - DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16); - uint8_t *above_row = yabove_data + 16; - const int bs = 4 << txsz; + DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16); + uint8_t *above_row = above_data + 16; + const uint8_t *const_above_row = above_row; + const int bs = 4 << tx_size; // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z @@ -369,45 +333,46 @@ static void build_intra_predictors(uint8_t *src, int src_stride, // .. 
once(init_intra_pred_fn_ptrs); + + // left if (left_available) { for (i = 0; i < bs; i++) - left_col[i] = src[i * src_stride - 1]; + left_col[i] = ref[i * ref_stride - 1]; } else { vpx_memset(left_col, 129, bs); } + // above if (up_available) { - uint8_t *above_ptr = src - src_stride; + const uint8_t *above_ref = ref - ref_stride; if (bs == 4 && right_available && left_available) { - above_row = above_ptr; + const_above_row = above_ref; } else { - vpx_memcpy(above_row, above_ptr, bs); + vpx_memcpy(above_row, above_ref, bs); if (bs == 4 && right_available) - vpx_memcpy(above_row + bs, above_ptr + bs, bs); + vpx_memcpy(above_row + bs, above_ref + bs, bs); else vpx_memset(above_row + bs, above_row[bs - 1], bs); - above_row[-1] = left_available ? above_ptr[-1] : 129; + above_row[-1] = left_available ? above_ref[-1] : 129; } } else { vpx_memset(above_row, 127, bs * 2); above_row[-1] = 127; } + // predict if (mode == DC_PRED) { - dc_pred[left_available][up_available][txsz](pred_ptr, stride, - above_row, left_col); + dc_pred[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, left_col); } else { - pred[mode][txsz](pred_ptr, stride, above_row, left_col); + pred[mode][tx_size](dst, dst_stride, const_above_row, left_col); } } -void vp9_predict_intra_block(MACROBLOCKD *xd, - int block_idx, - int bwl_in, - TX_SIZE tx_size, - int mode, - uint8_t *reference, int ref_stride, - uint8_t *predictor, int pre_stride) { +void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride) { const int bwl = bwl_in - tx_size; const int wmask = (1 << bwl) - 1; const int have_top = (block_idx >> bwl) || xd->up_available; @@ -415,10 +380,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd, const int have_right = ((block_idx & wmask) != wmask); assert(bwl >= 0); - build_intra_predictors(reference, ref_stride, - predictor, pre_stride, - mode, - tx_size, - have_top, have_left, - have_right); + build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size, + have_top, have_left, have_right); } diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index e369a71..e9d0dbf 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -14,17 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" -MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n, - int tx, int ty); - -MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block, - uint8_t *ptr, int stride); - -void vp9_predict_intra_block(MACROBLOCKD *xd, - int block_idx, - int bwl_in, - TX_SIZE tx_size, - int mode, uint8_t *ref, int ref_stride, - uint8_t *predictor, int pre_stride); +void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, + TX_SIZE tx_size, int mode, + const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index 6bb3cb8..042afbb 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -21,10 +21,11 @@ EOF forward_decls vp9_common_forward_decls # x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. 
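The rewritten predictors above all reduce to short arithmetic kernels over the above/left edge pixels. A standalone, runnable sketch of the new dc_predictor logic (toy edge values; plain libc memset standing in for vpx_memset; not part of the patch):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                         const uint8_t *above, const uint8_t *left) {
  int i, r, expected_dc, sum = 0;
  const int count = 2 * bs;
  for (i = 0; i < bs; i++) {
    sum += above[i];   /* top edge */
    sum += left[i];    /* left edge */
  }
  expected_dc = (sum + (count >> 1)) / count;  /* rounded average */
  for (r = 0; r < bs; r++) {
    memset(dst, expected_dc, bs);
    dst += stride;
  }
}

int main(void) {
  const uint8_t above[4] = { 100, 102, 104, 106 };
  const uint8_t left[4] = { 90, 92, 94, 96 };
  uint8_t dst[16];
  dc_predictor(dst, 4, 4, above, left);
  printf("dc = %d\n", dst[0]);  /* (784 + 4) / 8 = 98 */
  return 0;
}

The dc_top and dc_left variants are the same rounded average restricted to a single edge, which is why build_intra_predictors indexes dc_pred by [left_available][up_available].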
-[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse2_x86inc=sse2 && ssse3_x86inc=ssse3 +[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse && + sse2_x86inc=sse2 && ssse3_x86inc=ssse3 # this variable is for functions that are 64 bit only. -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 # # Dequant @@ -45,160 +46,160 @@ specialize vp9_idct_add_32x32 # # RECON # -prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_4x4 +prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_4x4 -prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_4x4 ssse3 +prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_4x4 $ssse3_x86inc -prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_4x4 -prototype void vp9_h_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_4x4 ssse3 +prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_4x4 $ssse3_x86inc -prototype void vp9_d117_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_4x4 -prototype void vp9_d135_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_4x4 -prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_4x4 -prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_4x4 sse +prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_4x4 $sse_x86inc -prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_tm_predictor_4x4 sse +prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_4x4 $sse_x86inc -prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_dc_predictor_4x4 sse +prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_4x4 $sse_x86inc -prototype void vp9_dc_top_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t 
y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_4x4 -prototype void vp9_dc_left_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_4x4 -prototype void vp9_dc_128_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_4x4 -prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_8x8 +prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_8x8 -prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_8x8 ssse3 +prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_8x8 $ssse3_x86inc -prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_8x8 -prototype void vp9_h_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_8x8 ssse3 +prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_8x8 $ssse3_x86inc -prototype void vp9_d117_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_8x8 -prototype void vp9_d135_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_8x8 -prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_8x8 -prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_8x8 sse +prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_8x8 $sse_x86inc -prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_tm_predictor_8x8 sse2 +prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_8x8 $sse2_x86inc -prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" 
-specialize vp9_dc_predictor_8x8 sse +prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_8x8 $sse_x86inc -prototype void vp9_dc_top_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_8x8 -prototype void vp9_dc_left_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_8x8 -prototype void vp9_dc_128_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_8x8 -prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_16x16 +prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_16x16 -prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_16x16 ssse3 +prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_16x16 $ssse3_x86inc -prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_16x16 -prototype void vp9_h_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_16x16 ssse3 +prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_16x16 $ssse3_x86inc -prototype void vp9_d117_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_16x16 -prototype void vp9_d135_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_16x16 -prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_16x16 -prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_16x16 sse2 +prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_16x16 $sse2_x86inc -prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t 
*yleft_col" -specialize vp9_tm_predictor_16x16 sse2 +prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_16x16 $sse2_x86inc -prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_dc_predictor_16x16 sse2 +prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_16x16 $sse2_x86inc -prototype void vp9_dc_top_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_16x16 -prototype void vp9_dc_left_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_16x16 -prototype void vp9_dc_128_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_16x16 -prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d27_predictor_32x32 +prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d207_predictor_32x32 -prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_32x32 ssse3 +prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_d45_predictor_32x32 $ssse3_x86inc -prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d63_predictor_32x32 -prototype void vp9_h_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_h_predictor_32x32 ssse3 +prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_h_predictor_32x32 $ssse3 x86inc -prototype void vp9_d117_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d117_predictor_32x32 -prototype void vp9_d135_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d135_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d135_predictor_32x32 -prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_d153_predictor_32x32 -prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t 
y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_v_predictor_32x32 sse2 +prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_v_predictor_32x32 $sse2_x86inc -prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_tm_predictor_32x32 sse2_x86_64 +prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_tm_predictor_32x32 $sse2_x86_64 -prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_dc_predictor_32x32 sse2 +prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" +specialize vp9_dc_predictor_32x32 $sse2_x86inc -prototype void vp9_dc_top_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_top_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_top_predictor_32x32 -prototype void vp9_dc_left_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_left_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_left_predictor_32x32 -prototype void vp9_dc_128_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_128_predictor_32x32 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then @@ -236,7 +237,7 @@ specialize vp9_loop_filter_horizontal_edge mmx neon # # post proc # -if [ "$CONFIG_POSTPROC" = "yes" ]; then +if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit" specialize vp9_mbpost_proc_down mmx sse2 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm @@ -267,10 +268,10 @@ specialize vp9_blend_b # Sub Pixel Filters # prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy $sse2_x86inc +specialize vp9_convolve_copy $sse2_x86inc neon prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg $sse2_x86inc +specialize vp9_convolve_avg $sse2_x86inc neon prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8 ssse3 neon @@ -294,40 +295,40 @@ specialize vp9_convolve8_avg_vert ssse3 neon # dct # prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_1_add sse2 +specialize vp9_short_idct4x4_1_add sse2 neon prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_add sse2 +specialize vp9_short_idct4x4_add sse2 neon prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t 
*dest, int dest_stride" -specialize vp9_short_idct8x8_1_add sse2 +specialize vp9_short_idct8x8_1_add sse2 neon prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_8x8_add sse2 +specialize vp9_short_idct10_8x8_add sse2 neon prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_1_add sse2 +specialize vp9_short_idct16x16_1_add sse2 neon prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct16x16_add sse2 +specialize vp9_short_idct16x16_add sse2 neon prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_16x16_add sse2 +specialize vp9_short_idct10_16x16_add sse2 neon prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct32x32_add sse2 +specialize vp9_short_idct32x32_add sse2 neon prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add sse2 +specialize vp9_short_iht4x4_add sse2 neon prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht8x8_add sse2 +specialize vp9_short_iht8x8_add sse2 neon prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" specialize vp9_short_iht16x16_add sse2 @@ -342,12 +343,6 @@ specialize vp9_short_iwalsh4x4_1_add prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_iwalsh4x4_add -prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" -specialize vp9_sad32x3 - -prototype unsigned int vp9_sad3x32 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" -specialize vp9_sad3x32 - # # Encoder functions below this point. 
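Since the specialize churn above repeats for every function, a quick reminder of the mechanism: prototype declares a function and specialize lists its ISA variants; the rtcd generator turns each entry into either a #define aliasing the _c version (when nothing is specialized) or a function pointer assigned once at startup. A hedged, self-contained sketch of that dispatch pattern (names and the flag value are illustrative, not the real generated code):

#include <stdio.h>

#define HAS_SSE2 0x08  /* illustrative flag bit, not the real value */

static void predictor_c(void)    { puts("C path"); }
static void predictor_sse2(void) { puts("SSE2 path"); }

static void (*predictor)(void);

static void setup_rtcd_internal(int flags) {
  predictor = predictor_c;                       /* safe default */
  if (flags & HAS_SSE2) predictor = predictor_sse2;
}

int main(void) {
  setup_rtcd_internal(HAS_SSE2);  /* pretend CPUID reported SSE2 */
  predictor();                    /* prints "SSE2 path" */
  return 0;
}

The switch from bare sse2/ssse3 tokens to $sse2_x86inc and $ssse3_x86inc ties into the x86inc comment above: when CONFIG_USE_X86INC is off, those shell variables are empty, so the corresponding specializations simply vanish from the generated header.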
# @@ -356,217 +351,214 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then # variance prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x16 sse2 +specialize vp9_variance32x16 $sse2_x86inc prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x32 sse2 +specialize vp9_variance16x32 $sse2_x86inc prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x32 sse2 +specialize vp9_variance64x32 $sse2_x86inc prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x64 sse2 +specialize vp9_variance32x64 $sse2_x86inc prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance32x32 sse2 +specialize vp9_variance32x32 $sse2_x86inc prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance64x64 sse2 +specialize vp9_variance64x64 $sse2_x86inc prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x16 mmx sse2 +specialize vp9_variance16x16 mmx $sse2_x86inc prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance16x8 mmx sse2 +specialize vp9_variance16x8 mmx $sse2_x86inc prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x16 mmx sse2 +specialize vp9_variance8x16 mmx $sse2_x86inc prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x8 mmx sse2 +specialize vp9_variance8x8 mmx $sse2_x86inc prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum" specialize vp9_get_sse_sum_8x8 sse2 vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2 prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance8x4 sse2 +specialize vp9_variance8x4 $sse2_x86inc prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x8 sse2 +specialize vp9_variance4x8 $sse2_x86inc prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance4x4 mmx sse2 +specialize vp9_variance4x4 mmx $sse2_x86inc prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 ssse3 +specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int 
vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 sse2 ssse3 +specialize vp9_sub_pixel_variance32x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance32x64 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 sse2 ssse3 +specialize vp9_sub_pixel_variance64x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance64x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 sse2 ssse3 +specialize vp9_sub_pixel_variance32x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance32x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 sse2 ssse3 +specialize vp9_sub_pixel_variance16x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 ssse3 +specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance16x16 "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 ssse3 +specialize vp9_sub_pixel_variance16x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance16x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 ssse3 +specialize vp9_sub_pixel_variance8x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance8x16 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 ssse3 +specialize vp9_sub_pixel_variance16x8 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance16x8 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 ssse3 +specialize vp9_sub_pixel_variance8x8 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance8x8 $sse2_x86inc $ssse3_x86inc # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 sse2 ssse3 +specialize vp9_sub_pixel_variance8x4 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3 +specialize vp9_sub_pixel_avg_variance8x4 $sse2_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 sse ssse3 +specialize vp9_sub_pixel_variance4x8 $sse_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 sse ssse3 +specialize vp9_sub_pixel_avg_variance4x8 $sse_x86inc $ssse3_x86inc prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse ssse3 +specialize vp9_sub_pixel_variance4x4 $sse_x86inc $ssse3_x86inc #vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 sse ssse3 +specialize vp9_sub_pixel_avg_variance4x4 $sse_x86inc $ssse3_x86inc prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x64 sse2 +specialize vp9_sad64x64 $sse2_x86inc prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x64 sse2 +specialize vp9_sad32x64 $sse2_x86inc prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad64x32 sse2 +specialize vp9_sad64x32 $sse2_x86inc prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x16 sse2 +specialize vp9_sad32x16 $sse2_x86inc prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x32 sse2 +specialize vp9_sad16x32 $sse2_x86inc prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad32x32 sse2 +specialize vp9_sad32x32 $sse2_x86inc prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x16 mmx sse2 +specialize vp9_sad16x16 mmx $sse2_x86inc prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad16x8 mmx sse2 +specialize vp9_sad16x8 mmx $sse2_x86inc prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x16 mmx sse2 +specialize vp9_sad8x16 mmx $sse2_x86inc prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x8 mmx sse2 +specialize vp9_sad8x8 mmx $sse2_x86inc prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x4 sse2 +specialize vp9_sad8x4 $sse2_x86inc prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x8 sse +specialize vp9_sad4x8 $sse_x86inc prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int max_sad" -specialize vp9_sad4x4 mmx sse +specialize vp9_sad4x4 mmx $sse_x86inc prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x64_avg sse2 +specialize vp9_sad64x64_avg $sse2_x86inc prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x64_avg sse2 +specialize vp9_sad32x64_avg $sse2_x86inc prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad64x32_avg sse2 +specialize vp9_sad64x32_avg $sse2_x86inc prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x16_avg sse2 +specialize vp9_sad32x16_avg $sse2_x86inc prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x32_avg sse2 +specialize vp9_sad16x32_avg $sse2_x86inc prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad32x32_avg sse2 +specialize vp9_sad32x32_avg $sse2_x86inc prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x16_avg sse2 +specialize vp9_sad16x16_avg $sse2_x86inc prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad16x8_avg sse2 +specialize vp9_sad16x8_avg $sse2_x86inc prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x16_avg sse2 +specialize vp9_sad8x16_avg $sse2_x86inc prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x8_avg sse2 +specialize vp9_sad8x8_avg $sse2_x86inc prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad8x4_avg sse2 +specialize vp9_sad8x4_avg $sse2_x86inc prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x8_avg sse +specialize vp9_sad4x8_avg $sse_x86inc prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" -specialize vp9_sad4x4_avg sse +specialize vp9_sad4x4_avg $sse_x86inc prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h sse2 
-vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt +specialize vp9_variance_halfpixvar16x16_h $sse2_x86inc prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v sse2 -vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt +specialize vp9_variance_halfpixvar16x16_v $sse2_x86inc prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv sse2 -vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt +specialize vp9_variance_halfpixvar16x16_hv $sse2_x86inc prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_variance_halfpixvar64x64_h @@ -678,8 +670,7 @@ specialize vp9_sad4x4x4d sse #specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" -specialize vp9_mse16x16 mmx sse2 -vp9_mse16x16_sse2=vp9_mse16x16_wmt +specialize vp9_mse16x16 mmx $sse2_x86inc prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse8x16 @@ -743,7 +734,7 @@ prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int p specialize vp9_short_fdct8x4 sse2 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct32x32 +specialize vp9_short_fdct32x32 sse2 prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct32x32_rd sse2 diff --git a/libvpx/vp9/common/vp9_scale.c b/libvpx/vp9/common/vp9_scale.c new file mode 100644 index 0000000..989206c --- /dev/null +++ b/libvpx/vp9/common/vp9_scale.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_scale.h" + +static INLINE int scaled_x(int val, const struct scale_factors *scale) { + return val * scale->x_scale_fp >> REF_SCALE_SHIFT; +} + +static INLINE int scaled_y(int val, const struct scale_factors *scale) { + return val * scale->y_scale_fp >> REF_SCALE_SHIFT; +} + +static int unscaled_value(int val, const struct scale_factors *scale) { + (void) scale; + return val; +} + +static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) { + const MV32 res = { + scaled_y(mv->row, scale) + scale->y_offset_q4, + scaled_x(mv->col, scale) + scale->x_offset_q4 + }; + return res; +} + +static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) { + const MV32 res = { + mv->row, + mv->col + }; + return res; +} + +static void set_offsets_with_scaling(struct scale_factors *scale, + int row, int col) { + scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK; + scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK; +} + +static void set_offsets_without_scaling(struct scale_factors *scale, + int row, int col) { + scale->x_offset_q4 = 0; + scale->y_offset_q4 = 0; +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return (other_size << REF_SCALE_SHIFT) / this_size; +} + +static int check_scale_factors(int other_w, int other_h, + int this_w, int this_h) { + return 2 * this_w >= other_w && + 2 * this_h >= other_h && + this_w <= 16 * other_w && + this_h <= 16 * other_h; +} + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + int other_w, int other_h, + int this_w, int this_h) { + if (!check_scale_factors(other_w, other_h, this_w, this_h)) { + scale->x_scale_fp = REF_INVALID_SCALE; + scale->y_scale_fp = REF_INVALID_SCALE; + return; + } + + scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + scale->x_step_q4 = scaled_x(16, scale); + scale->y_step_q4 = scaled_y(16, scale); + scale->x_offset_q4 = 0; // calculated per block + scale->y_offset_q4 = 0; // calculated per block + + if (vp9_is_scaled(scale)) { + scale->scale_value_x = scaled_x; + scale->scale_value_y = scaled_y; + scale->set_scaled_offsets = set_offsets_with_scaling; + scale->scale_mv = scaled_mv; + } else { + scale->scale_value_x = unscaled_value; + scale->scale_value_y = unscaled_value; + scale->set_scaled_offsets = set_offsets_without_scaling; + scale->scale_mv = unscaled_mv; + } + + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + if (scale->x_step_q4 == 16) { + if (scale->y_step_q4 == 16) { + // No scaling in either direction. 
+ scale->predict[0][0][0] = vp9_convolve_copy; + scale->predict[0][0][1] = vp9_convolve_avg; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + scale->predict[0][0][0] = vp9_convolve8_vert; + scale->predict[0][0][1] = vp9_convolve8_avg_vert; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } else { + if (scale->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + scale->predict[0][0][0] = vp9_convolve8_horiz; + scale->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // Must always scale in both directions. + scale->predict[0][0][0] = vp9_convolve8; + scale->predict[0][0][1] = vp9_convolve8_avg; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions + scale->predict[1][1][0] = vp9_convolve8; + scale->predict[1][1][1] = vp9_convolve8_avg; +} diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h new file mode 100644 index 0000000..7a720d0 --- /dev/null +++ b/libvpx/vp9/common/vp9_scale.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
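For the predict table just assembled in vp9_scale.c: callers index it as predict[has_subpel_x][has_subpel_y][avg], matching the "// horiz, vert, avg" note in the header below. A hedged sketch of that lookup (the helper name and the simplified function type are hypothetical; the real convolve_fn_t takes src/dst pointers, strides, filters, steps and a block size):

typedef void (*convolve_fn_t)(void);  /* simplified stand-in for the real type */

static convolve_fn_t pick_predictor(convolve_fn_t predict[2][2][2],
                                    int subpel_x, int subpel_y, int avg) {
  /* Any nonzero q4 fraction on an axis requires filtering along that axis;
     avg selects the _avg variants that blend into the destination. */
  return predict[subpel_x != 0][subpel_y != 0][avg];
}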
+ */ + +#ifndef VP9_COMMON_VP9_SCALE_H_ +#define VP9_COMMON_VP9_SCALE_H_ + +#include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_convolve.h" + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_offset_q4; + int x_step_q4; + int y_offset_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *scale); + int (*scale_value_y)(int val, const struct scale_factors *scale); + void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); + MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale); + + convolve_fn_t predict[2][2][2]; // horiz, vert, avg +}; + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + int other_w, int other_h, + int this_w, int this_h); + +static int vp9_is_valid_scale(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static int vp9_is_scaled(const struct scale_factors *sf) { + return sf->x_scale_fp != REF_NO_SCALE || + sf->y_scale_fp != REF_NO_SCALE; +} + +#endif // VP9_COMMON_VP9_SCALE_H_ diff --git a/libvpx/vp9/common/vp9_subpelvar.h b/libvpx/vp9/common/vp9_subpelvar.h index ad674f1..fe75481 100644 --- a/libvpx/vp9/common/vp9_subpelvar.h +++ b/libvpx/vp9/common/vp9_subpelvar.h @@ -11,7 +11,8 @@ #ifndef VP9_COMMON_VP9_SUBPELVAR_H_ #define VP9_COMMON_VP9_SUBPELVAR_H_ -#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_convolve.h" static void variance(const uint8_t *src_ptr, int source_stride, @@ -78,10 +79,10 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, for (i = 0; i < output_height; i++) { for (j = 0; j < output_width; j++) { - // Apply bilinear filter - output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[pixel_step] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; } @@ -127,20 +128,16 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, unsigned int output_width, const int16_t *vp9_filter) { unsigned int i, j; - int Temp; for (i = 0; i < output_height; i++) { for (j = 0; j < output_width; j++) { - // Apply filter - Temp = ((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[pixel_step] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2); - output_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT); + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); src_ptr++; } - // Next row... 
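On the vp9_subpelvar.h hunks above: the switch to ROUND_POWER_OF_TWO is intended as a pure refactor, since adding half the filter weight and shifting by VP9_FILTER_SHIFT rounds exactly like ROUND_POWER_OF_TWO(v, FILTER_BITS). A standalone check, with every constant restated locally as an assumption (ROUND_POWER_OF_TWO as in vp9_common.h, FILTER_BITS == 7, and the old 128/7 weight/shift pair):

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define FILTER_BITS 7
#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7

int main(void) {
  int v;
  /* Bilinear taps sum to 128 and pixels are 0..255, so the filtered sums
     never exceed 255 * 128; both roundings agree over the whole range. */
  for (v = 0; v <= 255 * VP9_FILTER_WEIGHT; ++v)
    assert(((v + VP9_FILTER_WEIGHT / 2) >> VP9_FILTER_SHIFT) ==
           ROUND_POWER_OF_TWO(v, FILTER_BITS));
  return 0;
}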
- src_ptr += src_pixels_per_line - output_width; + src_ptr += src_pixels_per_line - output_width; output_ptr += output_width; } } diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h index 1b9147e..cc909e2 100644 --- a/libvpx/vp9/common/vp9_systemdependent.h +++ b/libvpx/vp9/common/vp9_systemdependent.h @@ -34,6 +34,6 @@ static int round(double x) { #endif struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *); +void vp9_machine_specific_config(struct VP9Common *cm); #endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_ diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index a72d2ab..1791c1a 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -14,7 +14,7 @@ #define MAX_TILE_WIDTH_B64 64 static int to_sbs(n_mis) { - return mi_cols_aligned_to_sb(n_mis) >> LOG2_MI_BLOCK_SIZE; + return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2; } static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 4af4f94..fa4dd9b 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -17,23 +17,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]); - DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]); - - DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]); - DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]); - - DECLARE_ALIGNED(16, unsigned char, ap[8][8]); - DECLARE_ALIGNED(16, unsigned char, aq[8][8]); - - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - int i = 0; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; const unsigned int extended_thresh = _thresh[0] * 0x01010101u; const unsigned int extended_limit = _limit[0] * 0x01010101u; const unsigned int extended_blimit = _blimit[0] * 0x01010101u; @@ -44,41 +32,35 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const __m128i blimit = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); - p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p)); - - _mm_storel_epi64((__m128i *)ap[4], p4); - _mm_storel_epi64((__m128i *)ap[3], p3); - _mm_storel_epi64((__m128i *)ap[2], p2); - _mm_storel_epi64((__m128i *)ap[1], p1); - _mm_storel_epi64((__m128i *)ap[0], p0); - _mm_storel_epi64((__m128i *)aq[4], q4); - _mm_storel_epi64((__m128i *)aq[3], q3); - _mm_storel_epi64((__m128i *)aq[2], q2); - _mm_storel_epi64((__m128i *)aq[1], q1); - _mm_storel_epi64((__m128i *)aq[0], q0); - + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), + (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + 
q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), + (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), + (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), + (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), + (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); @@ -88,19 +70,16 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); + mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } @@ -110,21 +89,19 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i 
flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); @@ -134,82 +111,60 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); // loopfilter done { __m128i work; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); + flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p)); - flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), - _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), - _mm_subs_epu8(q0, q5))); - _mm_storel_epi64((__m128i *)ap[5], p5); - _mm_storel_epi64((__m128i *)aq[5], q5); - flat2 = _mm_max_epu8(work, flat2); - p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p)); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), - _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), - 
_mm_subs_epu8(q0, q6))); - _mm_storel_epi64((__m128i *)ap[6], p6); - _mm_storel_epi64((__m128i *)aq[6], q6); - flat2 = _mm_max_epu8(work, flat2); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), + (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), + (__m64 *)(s + 6 * p))); + + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0), + _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), + _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), + (__m64 *)(s + 7 * p))); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0), + _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), + _mm_subs_epu8(q0p0, q7p7))); - p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p)); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), - _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), - _mm_subs_epu8(q0, q7))); - _mm_storel_epi64((__m128i *)ap[7], p7); - _mm_storel_epi64((__m128i *)aq[7], q7); flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask @@ -220,260 +175,198 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, { const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); - { - __m128i workp_shft; - __m128i a, b, c; - - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero); - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero); - - c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 - c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); - - b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); - a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); - a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); - - _mm_storel_epi64((__m128i *)&flat_op[2][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q1, a); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); - _mm_storel_epi64((__m128i *)&flat_op[1][i*8], - 
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q2, a); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); - _mm_storel_epi64((__m128i *)&flat_op[0][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q3, a); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); - _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - b = _mm_add_epi16(q3, b); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); - _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - - c = _mm_add_epi16(q4, c); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - b = _mm_add_epi16(q3, b); - b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); - _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], - _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) - , b)); - a = _mm_add_epi16(q5, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q6, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = 
_mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - a = _mm_add_epi16(q7, a); - c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); - workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - } + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero);; + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, + pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16(four, + _mm_add_epi16(pixetFilter_p2p1p0, + pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = 
_mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, + _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, + _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, + _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, + _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - work_a = _mm_loadl_epi64((__m128i *)ap[2]); - p2 = _mm_loadl_epi64((__m128i *)flat_op[2]); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)flat_op[2], p2); - - p1 = _mm_loadl_epi64((__m128i *)flat_op[1]); - work_a = _mm_andnot_si128(flat, ps1); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - _mm_storel_epi64((__m128i *)flat_op[1], p1); - - p0 = _mm_loadl_epi64((__m128i *)flat_op[0]); - work_a = _mm_andnot_si128(flat, ps0); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - _mm_storel_epi64((__m128i *)flat_op[0], p0); - - q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]); - work_a = _mm_andnot_si128(flat, qs0); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - _mm_storel_epi64((__m128i *)flat_oq[0], q0); - - q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]); - work_a = _mm_andnot_si128(flat, qs1); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - _mm_storel_epi64((__m128i *)flat_oq[1], q1); 
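Both the removed scratch-buffer code here and the packed replacement rely on the same SSE2 select idiom, dst = (mask & filtered) | (~mask & original), with flat and flat2 as per-byte 0x00/0xFF masks produced by the comparisons earlier. A scalar restatement of the idiom (illustrative only):

#include <assert.h>
#include <stdint.h>

static uint8_t select_byte(uint8_t mask, uint8_t filtered, uint8_t orig) {
  /* Same trick as the _mm_and_si128 / _mm_andnot_si128 / _mm_or_si128
     triples above: mask is 0xFF where the flat filter applies. */
  return (uint8_t)((mask & filtered) | (~mask & orig));
}

int main(void) {
  assert(select_byte(0xFF, 7, 200) == 7);    /* flat: take the filtered value */
  assert(select_byte(0x00, 7, 200) == 200);  /* not flat: keep the original */
  return 0;
}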
- - work_a = _mm_loadl_epi64((__m128i *)aq[2]); - q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - _mm_storel_epi64((__m128i *)flat_oq[2], q2); - - // write out op6 - op3 - { - unsigned char *dst = (s - 7 * p); - for (i = 6; i > 2; i--) { - __m128i flat2_output; - work_a = _mm_loadl_epi64((__m128i *)ap[i]); - flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]); - work_a = _mm_andnot_si128(flat2, work_a); - flat2_output = _mm_and_si128(flat2, flat2_output); - work_a = _mm_or_si128(work_a, flat2_output); - _mm_storel_epi64((__m128i *)dst, work_a); - dst += p; - } - } - - work_a = _mm_loadl_epi64((__m128i *)flat_op[2]); - p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]); - work_a = _mm_andnot_si128(flat2, work_a); - p2 = _mm_and_si128(flat2, p2); - p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - - work_a = _mm_loadl_epi64((__m128i *)flat_op[1]); - p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]); - work_a = _mm_andnot_si128(flat2, work_a); - p1 = _mm_and_si128(flat2, p1); - p1 = _mm_or_si128(work_a, p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - - work_a = _mm_loadl_epi64((__m128i *)flat_op[0]); - p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]); - work_a = _mm_andnot_si128(flat2, work_a); - p0 = _mm_and_si128(flat2, p0); - p0 = _mm_or_si128(work_a, p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - - work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]); - q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]); - work_a = _mm_andnot_si128(flat2, work_a); - q0 = _mm_and_si128(flat2, q0); - q0 = _mm_or_si128(work_a, q0); - _mm_storel_epi64((__m128i *)(s - 0 * p), q0); - - work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]); - q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]); - work_a = _mm_andnot_si128(flat2, work_a); - q1 = _mm_and_si128(flat2, q1); - q1 = _mm_or_si128(work_a, q1); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - - work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]); - q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]); - work_a = _mm_andnot_si128(flat2, work_a); - q2 = _mm_and_si128(flat2, q2); - q2 = _mm_or_si128(work_a, q2); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - - // write out oq3 - oq7 - { - unsigned char *dst = (s + 3 * p); - for (i = 3; i < 7; i++) { - __m128i flat2_output; - work_a = _mm_loadl_epi64((__m128i *)aq[i]); - flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]); - work_a = _mm_andnot_si128(flat2, work_a); - flat2_output = _mm_and_si128(flat2, flat2_output); - work_a = _mm_or_si128(work_a, flat2_output); - _mm_storel_epi64((__m128i *)dst, work_a); - dst += p; - } - } + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * 
p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } } diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.c b/libvpx/vp9/decoder/vp9_dboolhuff.c index 31b1ae2..06acec4 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.c +++ b/libvpx/vp9/decoder/vp9_dboolhuff.c @@ -16,7 +16,7 @@ // This is meant to be a large, positive constant that can still be efficiently // loaded as an immediate (on platforms like ARM, for example). // Even relatively modest values like 100 would work fine. -#define VP9_LOTS_OF_BITS 0x40000000 +#define LOTS_OF_BITS 0x40000000 int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { @@ -41,13 +41,13 @@ void vp9_reader_fill(vp9_reader *r) { const uint8_t *buffer = r->buffer; VP9_BD_VALUE value = r->value; int count = r->count; - int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8); + int shift = BD_VALUE_SIZE - 8 - (count + 8); int loop_end = 0; const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT); const int x = shift + CHAR_BIT - bits_left; if (x >= 0) { - count += VP9_LOTS_OF_BITS; + count += LOTS_OF_BITS; loop_end = x; } @@ -66,7 +66,7 @@ void vp9_reader_fill(vp9_reader *r) { const uint8_t *vp9_reader_find_end(vp9_reader *r) { // Find the end of the coded buffer - while (r->count > CHAR_BIT && r->count < VP9_BD_VALUE_SIZE) { + while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) { r->count -= CHAR_BIT; r->buffer--; } @@ -83,10 +83,10 @@ int vp9_reader_has_error(vp9_reader *r) { // // When reading a byte from the user's buffer, count is filled with 8 and // one byte is filled into the value buffer. When we reach the end of the - // data, count is additionally filled with VP9_LOTS_OF_BITS. So when - // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. + // data, count is additionally filled with LOTS_OF_BITS. So when + // count == LOTS_OF_BITS - 1, the user's data has been exhausted. // // 1 if we have tried to decode bits after the end of stream was encountered. // 0 No error. 
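The renamed constants make the vp9_dboolhuff end-of-stream bookkeeping above easier to restate: refills add 8 per byte while data remains, a one-time LOTS_OF_BITS is folded into count at exhaustion, and decoding then draws count back down, so a count strictly between BD_VALUE_SIZE and LOTS_OF_BITS can only mean bits were consumed past the end. A standalone sketch of that predicate (the constants mirror this file; the test values are illustrative):

#include <assert.h>
#include <limits.h>
#include <stddef.h>

#define BD_VALUE_SIZE ((int)sizeof(size_t) * CHAR_BIT)  /* as in vp9_dboolhuff.h */
#define LOTS_OF_BITS 0x40000000

static int reader_has_error(int count) {
  /* Counts <= BD_VALUE_SIZE mean normal decoding; counts at or above
     LOTS_OF_BITS mean the end was reached but nothing was over-read. */
  return count > BD_VALUE_SIZE && count < LOTS_OF_BITS;
}

int main(void) {
  assert(!reader_has_error(24));               /* normal decoding */
  assert(!reader_has_error(LOTS_OF_BITS + 8)); /* at the end, nothing over-read */
  assert(reader_has_error(LOTS_OF_BITS - 1));  /* decoded past the end */
  return 0;
}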
- return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS; + return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; } diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h index c46dd73..c864516 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.h +++ b/libvpx/vp9/decoder/vp9_dboolhuff.h @@ -20,7 +20,7 @@ typedef size_t VP9_BD_VALUE; -#define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) +#define BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) typedef struct { const uint8_t *buffer_end; @@ -52,7 +52,7 @@ static int vp9_read(vp9_reader *br, int probability) { value = br->value; count = br->count; - bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8); + bigsplit = (VP9_BD_VALUE)split << (BD_VALUE_SIZE - 8); range = split; diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index a3e2ad3..84a29b1 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -43,7 +43,7 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { } static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize, vp9_reader *r) { + BLOCK_SIZE bsize, vp9_reader *r) { const uint8_t context = vp9_get_pred_context_tx_size(xd); const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs); TX_SIZE tx_size = vp9_read(r, tx_probs[0]); @@ -58,7 +58,7 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, } static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, - BLOCK_SIZE_TYPE bsize, int allow_select, + BLOCK_SIZE bsize, int allow_select, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; @@ -75,7 +75,7 @@ static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, return TX_4X4; } -static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, +static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = 1 << mi_width_log2(bsize); @@ -94,8 +94,8 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r) { MACROBLOCKD *const xd = &pbi->mb; - struct segmentation *const seg = &xd->seg; - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + struct segmentation *const seg = &pbi->common.seg; + const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; int segment_id; if (!seg->enabled) @@ -113,8 +113,8 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - struct segmentation *const seg = &xd->seg; - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + struct segmentation *const seg = &cm->seg; + const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; int pred_segment_id, segment_id; if (!seg->enabled) @@ -126,9 +126,9 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, return pred_segment_id; if (seg->temporal_update) { - const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); const int pred_flag = vp9_read(r, pred_prob); - vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + vp9_set_pred_flag_seg_id(xd, pred_flag); segment_id = pred_flag ? 
pred_segment_id : read_segment_id(r, seg); } else { @@ -141,7 +141,7 @@ static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - int skip_coeff = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); + int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); if (!skip_coeff) { const int ctx = vp9_get_pred_context_mbskip(xd); skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); @@ -155,19 +155,20 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &m->mbmi; - const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; - const int mis = cm->mode_info_stride; + const BLOCK_SIZE bsize = mbmi->sb_type; + const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride]; + const MODE_INFO *left_mi = xd->mi_8x8[-1]; mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r); - mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); - mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); + mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; - if (bsize >= BLOCK_SIZE_SB8X8) { - const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); + if (bsize >= BLOCK_8X8) { + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(m, 0) : DC_PRED; + left_block_mode(m, left_mi, 0) : DC_PRED; mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); } else { // Only 4x4, 4x8, 8x4 blocks @@ -178,9 +179,9 @@ static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int ib = idy * 2 + idx; - const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis); + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(m, ib) : DC_PRED; + left_block_mode(m, left_mi, ib) : DC_PRED; const MB_PREDICTION_MODE b_mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); m->bmi[ib].as_mode = b_mode; @@ -251,7 +252,7 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, } static void update_mv(vp9_reader *r, vp9_prob *p) { - if (vp9_read(r, VP9_NMV_UPDATE_PROB)) + if (vp9_read(r, NMV_UPDATE_PROB)) *p = (vp9_read_literal(r, 7) << 1) | 1; } @@ -303,8 +304,8 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { - ref_frame[0] = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME); + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); @@ -345,17 +346,17 @@ static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j) - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j) + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); } static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < VP9_INTER_MODES - 1; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + for (j = 0; j < INTER_MODES - 1; ++j) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); } @@ -370,23 +371,23 @@ static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( VP9D_COMP *pbi, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const vp9_prob *probs = vp9_get_pred_probs_switchable_interp(cm, xd); - const int index = treed_read(r, vp9_switchable_interp_tree, probs); const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][index]; - return vp9_switchable_interp[index]; + const int type = treed_read(r, vp9_switchable_interp_tree, + cm->fc.switchable_interp_prob[ctx]); + ++cm->counts.switchable_interp[ctx][type]; + return type; } static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MB_MODE_INFO *const mbmi = &mi->mbmi; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { const int size_group = size_group_lookup[bsize]; mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]); cm->counts.y_mode[size_group][mbmi->mode]++; @@ -420,8 +421,8 @@ static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { - return vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) != + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; } else { const int ctx = vp9_get_pred_context_intra_inter(xd); @@ 
-439,54 +440,56 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *const mbmi = &mi->mbmi; int_mv *const mv0 = &mbmi->mv[0]; int_mv *const mv1 = &mbmi->mv[1]; - const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = xd->allow_high_precision_mv; int_mv nearest, nearby, best_mv; int_mv nearest_second, nearby_second, best_mv_second; uint8_t inter_mode_ctx; - MV_REFERENCE_FRAME ref0, ref1; + MV_REFERENCE_FRAME ref0; + int is_compound; + mbmi->uv_mode = DC_PRED; read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame); ref0 = mbmi->ref_frame[0]; - ref1 = mbmi->ref_frame[1]; + is_compound = has_second_ref(mbmi); - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias, + vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], mi_row, mi_col); - inter_mode_ctx = mbmi->mb_mode_context[ref0]; + inter_mode_ctx = mbmi->mode_context[ref0]; - if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; - else if (bsize >= BLOCK_SIZE_SB8X8) - mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); - - mbmi->uv_mode = DC_PRED; + assert(bsize >= BLOCK_8X8); + } else { + if (bsize >= BLOCK_8X8) + mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); + } // nearest, nearby - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; } - mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE - ? read_switchable_filter_type(pbi, r) - : cm->mcomp_filter_type; - - if (ref1 > INTRA_FRAME) { - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias, - mi_row, mi_col); + if (is_compound) { + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; + vp9_find_mv_refs(cm, xd, mi, xd->last_mi, + ref1, mbmi->ref_mvs[ref1], mi_row, mi_col); - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], &nearest_second, &nearby_second); best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; } } - if (bsize < BLOCK_SIZE_SB8X8) { + mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE + ? 
read_switchable_filter_type(pbi, r) + : cm->mcomp_filter_type; + + if (bsize < BLOCK_8X8) { const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; @@ -500,7 +503,7 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0, mi_row, mi_col); - if (ref1 > 0) + if (is_compound) vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, &nearby_second, j, 1, mi_row, mi_col); @@ -511,30 +514,30 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp); - if (ref1 > 0) + if (is_compound) read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv, allow_hp); break; case NEARESTMV: blockmv.as_int = nearest.as_int; - if (ref1 > 0) + if (is_compound) secondmv.as_int = nearest_second.as_int; break; case NEARMV: blockmv.as_int = nearby.as_int; - if (ref1 > 0) + if (is_compound) secondmv.as_int = nearby_second.as_int; break; case ZEROMV: blockmv.as_int = 0; - if (ref1 > 0) + if (is_compound) secondmv.as_int = 0; break; default: assert(!"Invalid inter mode value"); } mi->bmi[j].as_mv[0].as_int = blockmv.as_int; - if (ref1 > 0) + if (is_compound) mi->bmi[j].as_mv[1].as_int = secondmv.as_int; if (num_4x4_h == 2) @@ -551,33 +554,25 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, switch (mbmi->mode) { case NEARMV: mv0->as_int = nearby.as_int; - clamp_mv2(&mv0->as_mv, xd); - - if (ref1 > 0) { + if (is_compound) mv1->as_int = nearby_second.as_int; - clamp_mv2(&mv1->as_mv, xd); - } break; case NEARESTMV: mv0->as_int = nearest.as_int; - clamp_mv2(&mv0->as_mv, xd); - - if (ref1 > 0) { + if (is_compound) mv1->as_int = nearest_second.as_int; - clamp_mv2(&mv1->as_mv, xd); - } break; case ZEROMV: mv0->as_int = 0; - if (ref1 > 0) + if (is_compound) mv1->as_int = 0; break; case NEWMV: read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp); - if (ref1 > 0) + if (is_compound) read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv, allow_hp); break; @@ -596,10 +591,10 @@ static void read_inter_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); - mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); inter_block = read_is_inter_block(pbi, mbmi->segment_id, r); - mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type, - !mbmi->mb_skip_coeff || !inter_block, r); + mbmi->tx_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type, + !mbmi->skip_coeff || !inter_block, r); if (inter_block) read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r); @@ -615,20 +610,20 @@ static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { if (cm->comp_pred_mode == HYBRID_PREDICTION) for (i = 0; i < COMP_INTER_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); } if (cm->comp_pred_mode != 
SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); } @@ -639,7 +634,7 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove. // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs)); for (k = 0; k < MBSKIP_CONTEXTS; ++k) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]); if (cm->frame_type != KEY_FRAME && !cm->intra_only) { @@ -653,19 +648,19 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { read_switchable_interp_probs(&cm->fc, r); for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]); read_comp_pred(cm, r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - for (i = 0; i < VP9_INTRA_MODES - 1; ++i) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + for (i = 0; i < INTRA_MODES - 1; ++i) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]); for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) for (i = 0; i < PARTITION_TYPES - 1; ++i) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]); read_mv_probs(r, nmvc, xd->allow_high_precision_mv); @@ -675,20 +670,21 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *mi = xd->mode_info_context; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + MODE_INFO *mi = xd->this_mi; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; const int bw = 1 << mi_width_log2(bsize); const int bh = 1 << mi_height_log2(bsize); const int y_mis = MIN(bh, cm->mi_rows - mi_row); const int x_mis = MIN(bw, cm->mi_cols - mi_col); - int x, y; + int x, y, z; if (cm->frame_type == KEY_FRAME || cm->intra_only) read_intra_frame_mode_info(pbi, mi, mi_row, mi_col, r); else read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r); - for (y = 0; y < y_mis; y++) - for (x = !y; x < x_mis; x++) - mi[y * cm->mode_info_stride + x] = *mi; + for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) + for (x = !y; x < x_mis; x++) { + xd->mi_8x8[z + x] = mi; + } } diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index feb6024..34ed0c7 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -63,44 +63,40 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 3; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 2; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) for (j = 0; j < TX_SIZES - 1; ++j) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + if (vp9_read(r, MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); } -static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) { +static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, 
int q_index) { int i; - const int segment_id = xd->mode_info_context->mbmi.segment_id; - xd->q_index = vp9_get_qindex(xd, segment_id, cm->base_qindex); + xd->plane[0].dequant = cm->y_dequant[q_index]; - xd->plane[0].dequant = cm->y_dequant[xd->q_index]; for (i = 1; i < MAX_MB_PLANE; i++) - xd->plane[i].dequant = cm->uv_dequant[xd->q_index]; + xd->plane[i].dequant = cm->uv_dequant[q_index]; } -static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { MACROBLOCKD* const xd = arg; - struct macroblockd_plane *pd = &xd->plane[plane]; - int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block); const int stride = pd->dst.stride; const int eob = pd->eobs[block]; - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, - raster_block, + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, stride); - - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_4X4: { const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); if (tx_type == DCT_DCT) @@ -120,87 +116,78 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, case TX_32X32: vp9_idct_add_32x32(qcoeff, dst, stride, eob); break; + default: + assert(!"Invalid transform size"); } } -static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { MACROBLOCKD* const xd = arg; - struct macroblockd_plane *pd = &xd->plane[plane]; - MODE_INFO *const mi = xd->mode_info_context; - - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, - raster_block, + struct macroblockd_plane *const pd = &xd->plane[plane]; + MODE_INFO *const mi = xd->this_mi; + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); - const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); - int b_mode; - int plane_b_size; - const int tx_ib = raster_block >> tx_size; - const int mode = plane == 0 ? mi->mbmi.mode - : mi->mbmi.uv_mode; - - if (plane == 0 && mi->mbmi.sb_type < BLOCK_8X8) { - assert(bsize == BLOCK_8X8); - b_mode = mi->bmi[raster_block].as_mode; - } else { - b_mode = mode; - } + const MB_PREDICTION_MODE mode = (plane == 0) + ? ((mi->mbmi.sb_type < BLOCK_8X8) ? 
mi->bmi[raster_block].as_mode + : mi->mbmi.mode) + : mi->mbmi.uv_mode; if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) - extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); + extend_for_intra(xd, plane_bsize, plane, block, tx_size); - plane_b_size = b_width_log2(bsize) - pd->subsampling_x; - vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, - dst, pd->dst.stride, - dst, pd->dst.stride); + vp9_predict_intra_block(xd, raster_block >> tx_size, + b_width_log2(plane_bsize), tx_size, mode, + dst, pd->dst.stride, dst, pd->dst.stride); - // Early exit if there are no coefficients - if (mi->mbmi.mb_skip_coeff) - return; - - decode_block(plane, block, bsize, ss_txfrm_size, arg); + if (!mi->mbmi.skip_coeff) + decode_block(plane, block, plane_bsize, tx_size, arg); } -static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) { +static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE bsize, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; - if (xd->mode_info_context->mbmi.mb_skip_coeff) { - vp9_reset_sb_tokens_context(xd, bsize); + if (mbmi->skip_coeff) { + reset_skip_context(xd, bsize); return -1; } else { - if (xd->seg.enabled) - init_dequantizer(&pbi->common, xd); + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); // TODO(dkovalev) if (!vp9_reader_has_error(r)) return vp9_decode_tokens(pbi, r, bsize); } } -static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, +static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize, int mi_row, int mi_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int bh = 1 << mi_height_log2(bsize); - const int bw = 1 << mi_width_log2(bsize); - const int mi_idx = mi_row * cm->mode_info_stride + mi_col; - int i; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int offset = mi_row * cm->mode_info_stride + mi_col; + + xd->mode_info_stride = cm->mode_info_stride; + + xd->mi_8x8 = cm->mi_grid_visible + offset; + xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; + + // we are using the mode info context stream here + xd->this_mi = + xd->mi_8x8[0] = xd->mic_stream_ptr; + xd->this_mi->mbmi.sb_type = bsize; + xd->mic_stream_ptr++; - xd->mode_info_context = cm->mi + mi_idx; - xd->mode_info_context->mbmi.sb_type = bsize; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + mi_idx : NULL; - - for (i = 0; i < MAX_MB_PLANE; i++) { - struct macroblockd_plane *pd = &xd->plane[i]; - pd->above_context = cm->above_context[i] + - (mi_col * 2 >> pd->subsampling_x); - pd->left_context = cm->left_context[i] + - (((mi_row * 2) & 15) >> pd->subsampling_y); - } + xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; + set_skip_context(cm, xd, mi_row, mi_col); set_partition_seg_context(cm, xd, mi_row, mi_col); // Distance of Mb to the various image edges. 
These are specified to 8th pel @@ -213,17 +200,21 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - const int ref = mbmi->ref_frame[i] - 1; - + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + const int ref = mbmi->ref_frame[i] - LAST_FRAME; const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; - xd->scale_factor[i] = cm->active_ref_scale[ref]; - setup_pre_planes(xd, i, cfg, mi_row, mi_col, &xd->scale_factor[i]); + const struct scale_factors *sf = &cm->active_ref_scale[ref]; + if (!vp9_is_valid_scale(sf)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid scale factors"); + + xd->scale_factor[i] = *sf; + setup_pre_planes(xd, i, cfg, mi_row, mi_col, sf); xd->corrupted |= cfg->corrupted; } static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { + vp9_reader *r, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; const int less8x8 = bsize < BLOCK_8X8; @@ -240,7 +231,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, bsize = BLOCK_8X8; // Has to be called after set_offsets - mbmi = &xd->mode_info_context->mbmi; + mbmi = &xd->this_mi->mbmi; if (!is_inter_block(mbmi)) { // Intra reconstruction @@ -251,7 +242,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, int eobtotal; set_ref(pbi, 0, mi_row, mi_col); - if (mbmi->ref_frame[1] > INTRA_FRAME) + if (has_second_ref(mbmi)) set_ref(pbi, 1, mi_row, mi_col); vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); @@ -264,7 +255,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, assert(mbmi->sb_type == bsize); if (eobtotal == 0) // skip loopfilter - vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, 1); + vp9_set_pred_flag_mbskip(xd, bsize, 1); else if (eobtotal > 0) foreach_transformed_block(xd, bsize, decode_block, xd); } @@ -273,14 +264,14 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, } static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const pc = &pbi->common; + vp9_reader* r, BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - int bs = (1 << mi_width_log2(bsize)) / 2, n; + const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition = PARTITION_NONE; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; - if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols) + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (bsize < BLOCK_8X8) { @@ -288,24 +279,25 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, return; } else { int pl; - const int idx = check_bsize_coverage(pc, mi_row, mi_col, bsize); - set_partition_seg_context(pc, xd, mi_row, mi_col); + const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols, + mi_row, mi_col); + set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); if (idx == 0) partition = treed_read(r, vp9_partition_tree, - pc->fc.partition_prob[pc->frame_type][pl]); + cm->fc.partition_prob[cm->frame_type][pl]); else if (idx > 0 && - !vp9_read(r, pc->fc.partition_prob[pc->frame_type][pl][idx])) + !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx])) partition = (idx == 1) ? 
PARTITION_HORZ : PARTITION_VERT; else partition = PARTITION_SPLIT; - pc->counts.partition[pl][partition]++; + cm->counts.partition[pl][partition]++; } subsize = get_subsize(bsize, partition); - *(get_sb_index(xd, subsize)) = 0; + *get_sb_index(xd, subsize) = 0; switch (partition) { case PARTITION_NONE: @@ -313,23 +305,24 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, break; case PARTITION_HORZ: decode_modes_b(pbi, mi_row, mi_col, r, subsize); - *(get_sb_index(xd, subsize)) = 1; - if (mi_row + bs < pc->mi_rows) - decode_modes_b(pbi, mi_row + bs, mi_col, r, subsize); + *get_sb_index(xd, subsize) = 1; + if (mi_row + hbs < cm->mi_rows) + decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize); break; case PARTITION_VERT: decode_modes_b(pbi, mi_row, mi_col, r, subsize); - *(get_sb_index(xd, subsize)) = 1; - if (mi_col + bs < pc->mi_cols) - decode_modes_b(pbi, mi_row, mi_col + bs, r, subsize); + *get_sb_index(xd, subsize) = 1; + if (mi_col + hbs < cm->mi_cols) + decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize); break; - case PARTITION_SPLIT: + case PARTITION_SPLIT: { + int n; for (n = 0; n < 4; n++) { - int j = n >> 1, i = n & 0x01; - *(get_sb_index(xd, subsize)) = n; - decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize); + const int j = n >> 1, i = n & 1; + *get_sb_index(xd, subsize) = n; + decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs, r, subsize); } - break; + } break; default: assert(!"Invalid partition type"); } @@ -337,7 +330,7 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { - set_partition_seg_context(pc, xd, mi_row, mi_col); + set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, subsize, bsize); } } @@ -345,18 +338,18 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, static void setup_token_decoder(VP9D_COMP *pbi, const uint8_t *data, size_t read_size, vp9_reader *r) { - VP9_COMMON *pc = &pbi->common; + VP9_COMMON *cm = &pbi->common; const uint8_t *data_end = pbi->source + pbi->source_sz; // Validate the calculated partition length. If the buffer // described by the partition can't be fully read, then restrict // it to the portion that can be (for EC mode) or throw an error. if (!read_is_valid(data, read_size, data_end)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); if (vp9_reader_init(r, data, read_size)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder %d", 1); } @@ -470,8 +463,7 @@ static void setup_loopfilter(struct loopfilter *lf, static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { const int old = *delta_q; - if (vp9_rb_read_bit(rb)) - *delta_q = vp9_rb_read_signed_literal(rb, 4); + *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0; return old != *delta_q; } @@ -498,8 +490,11 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { static INTERPOLATIONFILTERTYPE read_interp_filter_type( struct vp9_read_bit_buffer *rb) { + const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP }; return vp9_rb_read_bit(rb) ? 
SWITCHABLE - : vp9_rb_read_literal(rb, 2); + : literal_to_type[vp9_rb_read_literal(rb, 2)]; } static void read_frame_size(struct vp9_read_bit_buffer *rb, @@ -552,8 +547,8 @@ static void setup_frame_size(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { int width, height; read_frame_size(rb, &width, &height); - setup_display_size(&pbi->common, rb); apply_frame_size(pbi, width, height); + setup_display_size(&pbi->common, rb); } static void setup_frame_size_with_refs(VP9D_COMP *pbi, @@ -579,35 +574,36 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Referenced frame with invalid size"); - setup_display_size(cm, rb); apply_frame_size(pbi, width, height); + setup_display_size(cm, rb); } static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { const int num_threads = pbi->oxcf.max_threads; - VP9_COMMON *const pc = &pbi->common; + VP9_COMMON *const cm = &pbi->common; int mi_row, mi_col; + YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx]; if (pbi->do_loopfilter_inline) { if (num_threads > 1) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; - lf_data->cm = pc; + lf_data->frame_buffer = fb; + lf_data->cm = cm; lf_data->xd = pbi->mb; + lf_data->stop = 0; lf_data->y_only = 0; } - vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level); + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end; + for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within - vpx_memset(&pc->left_context, 0, sizeof(pc->left_context)); - vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); - for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end; - mi_col += MI_BLOCK_SIZE) { + vp9_zero(cm->left_context); + vp9_zero(cm->left_seg_context); + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE) decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64); - } if (pbi->do_loopfilter_inline) { // delay the loopfilter by 1 macroblock row. @@ -617,28 +613,32 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { if (num_threads > 1) { LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + // decoding has completed: finish up the loop filter in this thread. + if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue; + vp9_worker_sync(&pbi->lf_worker); lf_data->start = lf_start; lf_data->stop = mi_row; pbi->lf_worker.hook = vp9_loop_filter_worker; vp9_worker_launch(&pbi->lf_worker); } else { - YV12_BUFFER_CONFIG *const fb = - &pbi->common.yv12_fb[pbi->common.new_fb_idx]; - vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0); + vp9_loop_filter_rows(fb, cm, &pbi->mb, lf_start, mi_row, 0); } } } if (pbi->do_loopfilter_inline) { - YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + int lf_start; if (num_threads > 1) { - // TODO(jzern): since the loop filter is delayed one mb row, this will be - // forced to wait for the last row scheduled in the for loop. 
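For orientation, the inline loop-filter path above interleaves decoding and filtering with a one-superblock-row lag: a row is only filtered once the row below it has been decoded, because filtering rewrites border pixels that the next row still reads for intra prediction. A minimal standalone sketch of that schedule (stub functions in place of the real decoding and vp9_loop_filter_rows() work, not the library's API):

#include <stdio.h>

#define MI_BLOCK_SIZE 8  /* one 64x64 superblock spans 8 mode-info rows */

static void decode_sb_row(int mi_row) {         /* stub for real decoding */
  printf("decode rows %2d..%2d\n", mi_row, mi_row + MI_BLOCK_SIZE - 1);
}
static void filter_rows(int start, int stop) {  /* stub for the loop filter */
  printf("filter rows %2d..%2d\n", start, stop - 1);
}

int main(void) {
  const int mi_rows = 32;  /* toy frame height in mode-info units */
  int mi_row;
  for (mi_row = 0; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
    decode_sb_row(mi_row);
    if (mi_row > 0)  /* filtering trails decoding by one superblock row */
      filter_rows(mi_row - MI_BLOCK_SIZE, mi_row);
  }
  filter_rows(mi_rows - MI_BLOCK_SIZE, mi_rows);  /* last pending row */
  return 0;
}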
+ LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + vp9_worker_sync(&pbi->lf_worker); + lf_start = lf_data->stop; + } else { + lf_start = mi_row - MI_BLOCK_SIZE; } - vp9_loop_filter_rows(fb, pc, &pbi->mb, - mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0); + vp9_loop_filter_rows(fb, cm, &pbi->mb, + lf_start, cm->mi_rows, 0); } } @@ -661,20 +661,20 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { vp9_reader residual_bc; - VP9_COMMON *const pc = &pbi->common; + VP9_COMMON *const cm = &pbi->common; const uint8_t *const data_end = pbi->source + pbi->source_sz; - const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols); - const int tile_cols = 1 << pc->log2_tile_cols; - const int tile_rows = 1 << pc->log2_tile_rows; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; int tile_row, tile_col; // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. - vpx_memset(pc->above_context[0], 0, - sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); + vpx_memset(cm->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols)); - vpx_memset(pc->above_seg_context, 0, + vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { @@ -699,9 +699,9 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { } for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(pc, tile_row); + vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { - vp9_get_tile_col_offsets(pc, tile_col); + vp9_get_tile_col_offsets(cm, tile_col); setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], data_end - data_ptr2[tile_row][tile_col], &residual_bc); @@ -715,16 +715,16 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { int has_more; for (tile_row = 0; tile_row < tile_rows; tile_row++) { - vp9_get_tile_row_offsets(pc, tile_row); + vp9_get_tile_row_offsets(cm, tile_row); for (tile_col = 0; tile_col < tile_cols; tile_col++) { size_t size; - vp9_get_tile_col_offsets(pc, tile_col); + vp9_get_tile_col_offsets(cm, tile_col); has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; if (has_more) { if (!read_is_valid(data, 4, data_end)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); size = read_be32(data); @@ -810,7 +810,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; - xd->lf.filter_level = 0; + cm->lf.filter_level = 0; return 0; } @@ -861,7 +861,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); setup_frame_size(pbi, rb); } else { - pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); + pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2); @@ -892,11 +892,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->frame_context_idx = vp9_rb_read_literal(rb, 
NUM_FRAME_CONTEXTS_LOG2); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only) - vp9_setup_past_independence(cm, xd); + vp9_setup_past_independence(cm); - setup_loopfilter(&xd->lf, rb); + setup_loopfilter(&cm->lf, rb); setup_quantization(pbi, rb); - setup_segmentation(&xd->seg, rb); + setup_segmentation(&cm->seg, rb); setup_tile_info(cm, rb); @@ -937,17 +937,17 @@ void vp9_init_dequantizer(VP9_COMMON *cm) { int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; - VP9_COMMON *const pc = &pbi->common; + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; const uint8_t *data = pbi->source; const uint8_t *data_end = pbi->source + pbi->source_sz; struct vp9_read_bit_buffer rb = { data, data_end, 0, - pc, error_handler }; + cm, error_handler }; const size_t first_partition_size = read_uncompressed_header(pbi, &rb); - const int keyframe = pc->frame_type == KEY_FRAME; - YV12_BUFFER_CONFIG *new_fb = &pc->yv12_fb[pc->new_fb_idx]; + const int keyframe = cm->frame_type == KEY_FRAME; + YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx]; if (!first_partition_size) { // showing a frame directly @@ -958,51 +958,39 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { xd->corrupted = 0; new_fb->corrupted = 0; pbi->do_loopfilter_inline = - (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pbi->mb.lf.filter_level; + (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; if (!read_is_valid(data, first_partition_size, data_end)) - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt header length"); - xd->mode_info_context = pc->mi; - xd->prev_mode_info_context = pc->prev_mi; - xd->mode_info_stride = pc->mode_info_stride; - - init_dequantizer(pc, &pbi->mb); + setup_plane_dequants(cm, &pbi->mb, cm->base_qindex); - if (!keyframe) - vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc); + xd->mi_8x8 = cm->mi_grid_visible; + xd->mic_stream_ptr = cm->mi; + xd->mode_info_stride = cm->mode_info_stride; - pc->fc = pc->frame_contexts[pc->frame_context_idx]; + cm->fc = cm->frame_contexts[cm->frame_context_idx]; - vp9_zero(pc->counts); - - // Initialize xd pointers. Any reference should do for xd->pre, so use 0. 
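The vp9_rb_read_* helpers used throughout the header parsing above operate on a plain MSB-first bitstring (the uncompressed header), unlike the arithmetically coded vp9_reader used for tile data. A toy model of their semantics, assuming that bit order; rb_read_signed_literal mirrors the magnitude-then-sign layout that the rewritten read_delta_q() relies on:

#include <stdint.h>
#include <stddef.h>

struct bit_buffer {
  const uint8_t *data;
  size_t bit_offset;
};

/* Pull one bit, most-significant bit of each byte first. */
static int rb_read_bit(struct bit_buffer *rb) {
  const size_t off = rb->bit_offset++;
  return (rb->data[off >> 3] >> (7 - (off & 7))) & 1;
}

/* An n-bit unsigned literal is just n consecutive bits, MSB first. */
static int rb_read_literal(struct bit_buffer *rb, int bits) {
  int value = 0, i;
  for (i = 0; i < bits; ++i)
    value = (value << 1) | rb_read_bit(rb);
  return value;
}

/* Signed literal: magnitude first, then a trailing sign bit. */
static int rb_read_signed_literal(struct bit_buffer *rb, int bits) {
  const int value = rb_read_literal(rb, bits);
  return rb_read_bit(rb) ? -value : value;
}

int main(void) {
  const uint8_t data[] = { 0x58 };    /* bit pattern: 0101 1000 */
  struct bit_buffer rb = { data, 0 };
  return rb_read_signed_literal(&rb, 4) == -5 ? 0 : 1;  /* 0101, sign 1 */
}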
- setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0, NULL); - setup_dst_planes(xd, new_fb, 0, 0); + vp9_zero(cm->counts); new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); - // Create the segmentation map structure and set to 0 - if (!pc->last_frame_seg_map) - CHECK_MEM_ERROR(pc, pc->last_frame_seg_map, - vpx_calloc((pc->mi_rows * pc->mi_cols), 1)); - - setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y); + setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); // clear out the coeff buffer for (i = 0; i < MAX_MB_PLANE; ++i) vp9_zero(xd->plane[i].qcoeff); - set_prev_mi(pc); + set_prev_mi(cm); *p_data_end = decode_tiles(pbi, data + first_partition_size); - pc->last_width = pc->width; - pc->last_height = pc->height; + cm->last_width = cm->width; + cm->last_height = cm->height; new_fb->corrupted |= xd->corrupted; @@ -1010,21 +998,21 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (keyframe && !new_fb->corrupted) pbi->decoded_key_frame = 1; else - vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "A stream must start with a complete key frame"); } - if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(pc); + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(cm); - if (!keyframe && !pc->intra_only) { - vp9_adapt_mode_probs(pc); - vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv); + if (!keyframe && !cm->intra_only) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv); } } - if (pc->refresh_frame_context) - pc->frame_contexts[pc->frame_context_idx] = pc->fc; + if (cm->refresh_frame_context) + cm->frame_contexts[cm->frame_context_idx] = cm->fc; return 0; } diff --git a/libvpx/vp9/decoder/vp9_decodframe.h b/libvpx/vp9/decoder/vp9_decodframe.h index 00b6d67..c665f6f 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.h +++ b/libvpx/vp9/decoder/vp9_decodframe.h @@ -15,7 +15,7 @@ struct VP9Common; struct VP9Decompressor; -void vp9_init_dequantizer(struct VP9Common *pc); +void vp9_init_dequantizer(struct VP9Common *cm); int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end); #endif // VP9_DECODER_VP9_DECODFRAME_H_ diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index 0021643..cd74a0b 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -94,9 +94,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; - ENTROPY_CONTEXT above_ec, left_ec; - const int ref = is_inter_block(&xd->mode_info_context->mbmi); - int band, pt, c = 0; + const int ref = is_inter_block(&xd->this_mi->mbmi); + int band, c = 0; vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] = fc->coef_probs[tx_size][type][ref]; vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; @@ -104,41 +103,10 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_prob *prob; vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; const int16_t *scan, *nb; + const uint8_t *band_translate; uint8_t token_cache[1024]; - const uint8_t * band_translate; - - switch (tx_size) { - default: - case TX_4X4: { - scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); - above_ec = A[0] != 0; - left_ec = L[0] != 0; - band_translate = 
vp9_coefband_trans_4x4; - break; - } - case TX_8X8: { - scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - above_ec = !!*(uint16_t *)A; - left_ec = !!*(uint16_t *)L; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - case TX_16X16: { - scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - above_ec = !!*(uint32_t *)A; - left_ec = !!*(uint32_t *)L; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - case TX_32X32: - scan = vp9_default_scan_32x32; - above_ec = !!*(uint64_t *)A; - left_ec = !!*(uint64_t *)L; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - - pt = combine_entropy_contexts(above_ec, left_ec); + int pt = get_entropy_context(xd, tx_size, type, block_idx, A, L, + &scan, &band_translate); nb = vp9_get_coef_neighbors_handle(scan); while (1) { @@ -242,54 +210,38 @@ SKIP_START: return c; } -static int get_eob(struct segmentation *seg, int segment_id, int eob_max) { - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; -} - struct decode_block_args { VP9D_COMP *pbi; vp9_reader *r; int *eobtotal; }; -static void decode_block(int plane, int block, - BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, - void *argv) { +static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *argv) { const struct decode_block_args* const arg = argv; - const int bw = b_width_log2(bsize); // find the maximum eob for this transform size, adjusted by segment MACROBLOCKD *xd = &arg->pbi->mb; + struct segmentation *seg = &arg->pbi->common.seg; struct macroblockd_plane* pd = &xd->plane[plane]; - const int segment_id = xd->mode_info_context->mbmi.segment_id; - const TX_SIZE ss_tx_size = ss_txfrm_size / 2; - const int seg_eob = get_eob(&xd->seg, segment_id, 16 << ss_txfrm_size); - const int off = block >> ss_txfrm_size; - const int mod = bw - ss_tx_size - pd->subsampling_x; - const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size; - const int loff = (off >> mod) << ss_tx_size; - const int tx_size_in_blocks = 1 << ss_tx_size; - ENTROPY_CONTEXT *A = pd->above_context + aoff; - ENTROPY_CONTEXT *L = pd->left_context + loff; - const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, - pd->plane_type, seg_eob, - BLOCK_OFFSET(pd->qcoeff, block, 16), - ss_tx_size, pd->dequant, A, L); + const int segment_id = xd->this_mi->mbmi.segment_id; + const int seg_eob = get_tx_eob(seg, segment_id, tx_size); + int aoff, loff, eob; + + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); + + eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, + pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block), + tx_size, pd->dequant, + pd->above_context + aoff, pd->left_context + loff); + + set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff, - A, L); - } else { - int pt; - for (pt = 0; pt < tx_size_in_blocks; pt++) - A[pt] = L[pt] = eob > 0; - } pd->eobs[block] = eob; *arg->eobtotal += eob; } -int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize) { +int vp9_decode_tokens(VP9D_COMP *pbi, vp9_reader *r, BLOCK_SIZE bsize) { int eobtotal = 0; struct decode_block_args args = {pbi, r, &eobtotal}; foreach_transformed_block(&pbi->mb, bsize, decode_block, &args); diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h index f98fe8d..cf07c56 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libvpx/vp9/decoder/vp9_detokenize.h @@ 
-15,6 +15,6 @@ #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_dboolhuff.h" -int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize); +int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE bsize); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index 5a01dd7..17d5def 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -13,7 +13,7 @@ #include <stdio.h> #include "vp9/common/vp9_onyxc_int.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/decoder/vp9_onyxd.h" @@ -110,35 +110,36 @@ void vp9_initialize_dec() { VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP)); + VP9_COMMON *const cm = pbi ? &pbi->common : NULL; - if (!pbi) + if (!cm) return NULL; vp9_zero(*pbi); - if (setjmp(pbi->common.error.jmp)) { - pbi->common.error.setjmp = 0; + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; vp9_remove_decompressor(pbi); return NULL; } - pbi->common.error.setjmp = 1; + cm->error.setjmp = 1; vp9_initialize_dec(); - vp9_create_common(&pbi->common); + vp9_create_common(cm); pbi->oxcf = *oxcf; - pbi->common.current_video_frame = 0; pbi->ready_for_new_data = 1; + cm->current_video_frame = 0; // vp9_init_dequantizer() is first called here. Add check in // frame_init_dequantizer() to avoid unnecessary calling of // vp9_init_dequantizer() for every frame. - vp9_init_dequantizer(&pbi->common); + vp9_init_dequantizer(cm); - vp9_loop_filter_init(&pbi->common, &pbi->mb.lf); + vp9_loop_filter_init(cm); - pbi->common.error.setjmp = 0; + cm->error.setjmp = 0; pbi->decoded_key_frame = 0; if (pbi->oxcf.max_threads > 1) { @@ -160,9 +161,6 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { if (!pbi) return; - if (pbi->common.last_frame_seg_map) - vpx_free(pbi->common.last_frame_seg_map); - vp9_remove_common(&pbi->common); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); @@ -187,21 +185,21 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, * later commit that adds VP9-specific controls for this functionality. */ if (ref_frame_flag == VP9_LAST_FLAG) { - ref_fb_idx = pbi->common.ref_frame_map[0]; + ref_fb_idx = cm->ref_frame_map[0]; } else { - vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); - return pbi->common.error.error_code; + return cm->error.error_code; } if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) { - vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); } else { vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); } - return pbi->common.error.error_code; + return cm->error.error_code; } @@ -261,22 +259,21 @@ int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) { /* If any buffer updating is signaled it should be done here. 
*/ static void swap_frame_buffers(VP9D_COMP *pbi) { int ref_index = 0, mask; + VP9_COMMON *const cm = &pbi->common; for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - if (mask & 1) { - ref_cnt_fb(pbi->common.fb_idx_ref_cnt, - &pbi->common.ref_frame_map[ref_index], - pbi->common.new_fb_idx); - } + if (mask & 1) + ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->ref_frame_map[ref_index], + cm->new_fb_idx); ++ref_index; } - pbi->common.frame_to_show = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; - pbi->common.fb_idx_ref_cnt[pbi->common.new_fb_idx]--; + cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; + cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - /* Invalidate these references until the next frame starts. */ + // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) - pbi->common.active_ref_idx[ref_index] = INT_MAX; + cm->active_ref_idx[ref_index] = INT_MAX; } int vp9_receive_compressed_data(VP9D_PTR ptr, @@ -293,7 +290,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, if (ptr == 0) return -1; - pbi->common.error.error_code = VPX_CODEC_OK; + cm->error.error_code = VPX_CODEC_OK; pbi->source = source; pbi->source_sz = size; @@ -314,8 +311,8 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->new_fb_idx = get_free_fb(cm); - if (setjmp(pbi->common.error.jmp)) { - pbi->common.error.setjmp = 0; + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; /* We do not know if the missing frame(s) was supposed to update * any of the reference buffers, but we act conservatively and @@ -334,13 +331,13 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, return -1; } - pbi->common.error.setjmp = 1; + cm->error.setjmp = 1; retcode = vp9_decode_frame(pbi, psource); if (retcode < 0) { - pbi->common.error.error_code = VPX_CODEC_ERROR; - pbi->common.error.setjmp = 0; + cm->error.error_code = VPX_CODEC_ERROR; + cm->error.setjmp = 0; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) cm->fb_idx_ref_cnt[cm->new_fb_idx]--; return retcode; @@ -360,7 +357,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, if (!pbi->do_loopfilter_inline) { /* Apply the loop filter if appropriate.
*/ - vp9_loop_filter_frame(cm, &pbi->mb, pbi->mb.lf.filter_level, 0); + vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0); } #if WRITE_RECON_BUFFER == 2 @@ -389,12 +386,17 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, if (cm->show_frame) { // current mip will be the prev_mip for the next frame MODE_INFO *temp = cm->prev_mip; + MODE_INFO **temp2 = cm->prev_mi_grid_base; cm->prev_mip = cm->mip; cm->mip = temp; + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp2; // update the upper left visible macroblock ptrs cm->mi = cm->mip + cm->mode_info_stride + 1; cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; cm->current_video_frame++; } @@ -403,7 +405,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, pbi->last_time_stamp = time_stamp; pbi->source_sz = 0; - pbi->common.error.setjmp = 0; + cm->error.setjmp = 0; return retcode; } @@ -424,15 +426,17 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; -#if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&pbi->common, &pbi->mb.lf, sd, flags); +#if CONFIG_VP9_POSTPROC + ret = vp9_post_proc_frame(&pbi->common, sd, flags); #else if (pbi->common.frame_to_show) { *sd = *pbi->common.frame_to_show; sd->y_width = pbi->common.width; sd->y_height = pbi->common.height; - sd->uv_height = pbi->common.height / 2; + sd->uv_width = sd->y_width >> pbi->common.subsampling_x; + sd->uv_height = sd->y_height >> pbi->common.subsampling_y; + ret = 0; } else { ret = -1; diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 98ef420..957cfd2 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -41,9 +41,9 @@ unsigned __int64 Sectionbits[500]; #endif #ifdef ENTROPY_STATS -int intra_mode_stats[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES]; +int intra_mode_stats[INTRA_MODES] + [INTRA_MODES] + [INTRA_MODES]; vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES]; extern unsigned int active_section; @@ -54,8 +54,8 @@ extern unsigned int active_section; int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; -int64_t switchable_interp_stats[VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS]; +int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1] + [SWITCHABLE_FILTERS]; void init_tx_count_stats() { vp9_zero(tx_count_32x32p_stats); @@ -88,8 +88,8 @@ static void update_tx_count_stats(VP9_COMMON *cm) { static void update_switchable_interp_stats(VP9_COMMON *cm) { int i, j; - for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; ++i) - for (j = 0; j < VP9_SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTERS+1; ++i) + for (j = 0; j < SWITCHABLE_FILTERS; ++j) { switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j]; } } @@ -141,11 +141,11 @@ void write_switchable_interp_stats() { fclose(fp); printf( - "vp9_default_switchable_filter_count[VP9_SWITCHABLE_FILTERS+1]" - "[VP9_SWITCHABLE_FILTERS] = {\n"); - for (i = 0; i < VP9_SWITCHABLE_FILTERS+1; i++) { + "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]" + "[SWITCHABLE_FILTERS] = {\n"); + for (i = 0; i < SWITCHABLE_FILTERS+1; i++) { printf(" { "); - for (j = 0; j < VP9_SWITCHABLE_FILTERS; j++) { + for (j = 0; j < SWITCHABLE_FILTERS; j++) { 
printf("%"PRId64", ", switchable_interp_stats[i][j]); } printf("},\n"); @@ -181,7 +181,7 @@ static void update_mode( n--; for (i = 0; i < n; ++i) { - vp9_cond_prob_diff_update(w, &Pcur[i], VP9_MODE_UPDATE_PROB, bct[i]); + vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]); } } @@ -189,19 +189,20 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, vp9_writer* const bc) { VP9_COMMON *const cm = &cpi->common; int j; - vp9_prob pnew[VP9_INTRA_MODES - 1]; - unsigned int bct[VP9_INTRA_MODES - 1][2]; + vp9_prob pnew[INTRA_MODES - 1]; + unsigned int bct[INTRA_MODES - 1][2]; for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_tree, pnew, + update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew, cm->fc.y_mode_prob[j], bct, (unsigned int *)cpi->y_mode_count[j]); } -static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size, - BLOCK_SIZE_TYPE bsize, vp9_writer *w) { +static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, + TX_SIZE tx_size, BLOCK_SIZE bsize, + vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -213,10 +214,10 @@ static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size, static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip_coeff = m->mbmi.mb_skip_coeff; + const int skip_coeff = m->mbmi.skip_coeff; vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd)); return skip_coeff; } @@ -228,7 +229,7 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) { for (k = 0; k < MBSKIP_CONTEXTS; ++k) vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], - VP9_MODE_UPDATE_PROB, cm->counts.mbskip[k]); + MODE_UPDATE_PROB, cm->counts.mbskip[k]); } static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { @@ -237,43 +238,43 @@ static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { static void update_switchable_interp_probs(VP9_COMP *const cpi, vp9_writer* const bc) { - VP9_COMMON *const pc = &cpi->common; - unsigned int branch_ct[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1][2]; - vp9_prob new_prob[VP9_SWITCHABLE_FILTERS + 1][VP9_SWITCHABLE_FILTERS - 1]; + VP9_COMMON *const cm = &cpi->common; + unsigned int branch_ct[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS - 1][2]; + vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1]; int i, j; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { + for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { vp9_tree_probs_from_distribution( vp9_switchable_interp_tree, new_prob[j], branch_ct[j], - pc->counts.switchable_interp[j], 0); + cm->counts.switchable_interp[j], 0); } - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - vp9_cond_prob_diff_update(bc, &pc->fc.switchable_interp_prob[j][i], - VP9_MODE_UPDATE_PROB, branch_ct[j][i]); + for (j = 0; j <= SWITCHABLE_FILTERS; ++j) { + for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) { + vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i], 
+ MODE_UPDATE_PROB, branch_ct[j][i]); } } #ifdef MODE_STATS if (!cpi->dummy_packing) - update_switchable_interp_stats(pc); + update_switchable_interp_stats(cm); #endif } -static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) { +static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - unsigned int branch_ct[VP9_INTER_MODES - 1][2]; - vp9_prob new_prob[VP9_INTER_MODES - 1]; + unsigned int branch_ct[INTER_MODES - 1][2]; + vp9_prob new_prob[INTER_MODES - 1]; vp9_tree_probs_from_distribution(vp9_inter_mode_tree, new_prob, branch_ct, - pc->counts.inter_mode[i], NEARESTMV); + cm->counts.inter_mode[i], NEARESTMV); - for (j = 0; j < VP9_INTER_MODES - 1; ++j) - vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j], - VP9_MODE_UPDATE_PROB, branch_ct[j]); + for (j = 0; j < INTER_MODES - 1; ++j) + vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j], + MODE_UPDATE_PROB, branch_ct[j]); } } @@ -356,39 +357,39 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg, // This function encodes the reference frame static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { - VP9_COMMON *const pc = &cpi->common; + VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mi = &xd->this_mi->mbmi; const int segment_id = mi->segment_id; - int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, + int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); // If segment level coding of this signal is disabled... // or the segment allows multiple reference frame options if (!seg_ref_active) { // does the feature use compound prediction or not // (if not specified at the frame/segment level) - if (pc->comp_pred_mode == HYBRID_PREDICTION) { + if (cm->comp_pred_mode == HYBRID_PREDICTION) { vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME, - vp9_get_pred_prob_comp_inter_inter(pc, xd)); + vp9_get_pred_prob_comp_inter_inter(cm, xd)); } else { assert((mi->ref_frame[1] <= INTRA_FRAME) == - (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY)); + (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY)); } if (mi->ref_frame[1] > INTRA_FRAME) { vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME, - vp9_get_pred_prob_comp_ref_p(pc, xd)); + vp9_get_pred_prob_comp_ref_p(cm, xd)); } else { vp9_write(bc, mi->ref_frame[0] != LAST_FRAME, - vp9_get_pred_prob_single_ref_p1(pc, xd)); + vp9_get_pred_prob_single_ref_p1(cm, xd)); if (mi->ref_frame[0] != LAST_FRAME) vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME, - vp9_get_pred_prob_single_ref_p2(pc, xd)); + vp9_get_pred_prob_single_ref_p2(cm, xd)); } } else { assert(mi->ref_frame[1] <= INTRA_FRAME); - assert(vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) == + assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) == mi->ref_frame[0]); } @@ -397,20 +398,20 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { } static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { - VP9_COMMON *const pc = &cpi->common; - const nmv_context *nmvc = &pc->fc.nmvc; + VP9_COMMON *const cm = &cpi->common; + const nmv_context *nmvc = &cm->fc.nmvc; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - struct segmentation *seg = &xd->seg; + struct segmentation *seg = &cm->seg; MB_MODE_INFO *const mi = &m->mbmi; const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; const MB_PREDICTION_MODE mode = 
mi->mode; const int segment_id = mi->segment_id; int skip_coeff; - const BLOCK_SIZE_TYPE bsize = mi->sb_type; + const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = xd->allow_high_precision_mv; - x->partition_info = x->pi + (m - pc->mi); + x->partition_info = x->pi + (m - cm->mi); #ifdef ENTROPY_STATS active_section = 9; @@ -419,7 +420,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (seg->update_map) { if (seg->temporal_update) { const int pred_flag = mi->seg_id_predicted; - vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); vp9_write(bc, pred_flag, pred_prob); if (!pred_flag) write_segment_id(bc, seg, segment_id); @@ -432,12 +433,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(bc, rf != INTRA_FRAME, - vp9_get_pred_prob_intra_inter(pc, xd)); + vp9_get_pred_prob_intra_inter(cm, xd)); - if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT && + if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && !(rf != INTRA_FRAME && (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_tx_size(cpi, mi->txfm_size, bsize, bc); + write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc); } if (rf == INTRA_FRAME) { @@ -445,8 +446,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { active_section = 6; #endif - if (bsize >= BLOCK_SIZE_SB8X8) { - write_intra_mode(bc, mode, pc->fc.y_mode_prob[size_group_lookup[bsize]]); + if (bsize >= BLOCK_8X8) { + write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]); } else { int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; @@ -454,15 +455,15 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; - write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]); + write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]); } } } - write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); + write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]); } else { vp9_prob *mv_ref_p; encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]]; + mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]]; #ifdef ENTROPY_STATS active_section = 3; @@ -470,23 +471,23 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { // If segment skip is not enabled code the mode. 
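As an aside on encode_ref_frame() above: with HYBRID_PREDICTION enabled, the reference choice boils down to at most three binary decisions, each of which the real coder writes against a context-derived probability. A flattened sketch of that decision tree, assuming the reference ordering INTRA_FRAME < LAST_FRAME < GOLDEN_FRAME < ALTREF_FRAME implied by the code above; put_bit() is a hypothetical stand-in for vp9_write() with the probabilities omitted:

#include <stdio.h>

typedef enum { INTRA_FRAME, LAST_FRAME, GOLDEN_FRAME, ALTREF_FRAME } ref_t;

static void put_bit(int b) { printf("%d", b); }  /* stand-in for vp9_write() */

/* Mirrors encode_ref_frame(): compound-vs-single first, then which
 * reference(s) are in use. */
static void signal_refs(ref_t ref0, ref_t ref1) {
  const int is_compound = ref1 > INTRA_FRAME;
  put_bit(is_compound);
  if (is_compound) {
    put_bit(ref0 == GOLDEN_FRAME);    /* comp-ref decision */
  } else {
    put_bit(ref0 != LAST_FRAME);      /* single-ref decision 1 */
    if (ref0 != LAST_FRAME)
      put_bit(ref0 != GOLDEN_FRAME);  /* single-ref decision 2 */
  }
}

int main(void) {
  signal_refs(LAST_FRAME, INTRA_FRAME);    /* prints "00"  */
  printf("\n");
  signal_refs(ALTREF_FRAME, INTRA_FRAME);  /* prints "011" */
  printf("\n");
  return 0;
}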
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); - ++pc->counts.inter_mode[mi->mb_mode_context[rf]] + ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(mode)]; } } - if (cpi->common.mcomp_filter_type == SWITCHABLE) { + if (cm->mcomp_filter_type == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); write_token(bc, vp9_switchable_interp_tree, - vp9_get_pred_probs_switchable_interp(&cpi->common, xd), - vp9_switchable_interp_encodings + - vp9_switchable_interp_map[mi->interp_filter]); + cm->fc.switchable_interp_prob[ctx], + &vp9_switchable_interp_encodings[mi->interp_filter]); } else { - assert(mi->interp_filter == cpi->common.mcomp_filter_type); + assert(mi->interp_filter == cm->mcomp_filter_type); } - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { int j; MB_PREDICTION_MODE blockmode; int_mv blockmv; @@ -499,7 +500,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { blockmode = x->partition_info->bmi[j].mode; blockmv = m->bmi[j].as_mv[0]; write_sb_mv_ref(bc, blockmode, mv_ref_p); - ++pc->counts.inter_mode[mi->mb_mode_context[rf]] + ++cm->counts.inter_mode[mi->mode_context[rf]] [inter_mode_offset(blockmode)]; if (blockmode == NEWMV) { @@ -531,26 +532,29 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { } } -static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, +static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc) { - const VP9_COMMON *const c = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const struct segmentation *const seg = &cm->seg; + MODE_INFO *m = mi_8x8[0]; const int ym = m->mbmi.mode; - const int mis = c->mode_info_stride; const int segment_id = m->mbmi.segment_id; + MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride]; + MODE_INFO *left_mi = mi_8x8[-1]; - if (xd->seg.update_map) - write_segment_id(bc, &xd->seg, m->mbmi.segment_id); + if (seg->update_map) + write_segment_id(bc, seg, m->mbmi.segment_id); write_skip_coeff(cpi, segment_id, m, bc); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT) - write_selected_tx_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc); + if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) + write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); + if (m->mbmi.sb_type >= BLOCK_8X8) { + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); const MB_PREDICTION_MODE L = xd->left_available ? - left_block_mode(m, 0) : DC_PRED; + left_block_mode(m, left_mi, 0) : DC_PRED; write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; @@ -558,10 +562,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - const int i = idy * 2 + idx; - const MB_PREDICTION_MODE A = above_block_mode(m, i, mis); + int i = idy * 2 + idx; + const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(m, i) : DC_PRED; + left_block_mode(m, left_mi, i) : DC_PRED; const int bm = m->bmi[i].as_mode; #ifdef ENTROPY_STATS ++intra_mode_stats[A][L][bm]; @@ -574,21 +578,25 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } -static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, +static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; + MODE_INFO *m = mi_8x8[0]; - if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) + if (m->mbmi.sb_type < BLOCK_8X8) if (xd->ab_index > 0) return; - xd->mode_info_context = m; - set_mi_row_col(&cpi->common, xd, mi_row, - 1 << mi_height_log2(m->mbmi.sb_type), - mi_col, 1 << mi_width_log2(m->mbmi.sb_type)); + + xd->this_mi = mi_8x8[0]; + xd->mi_8x8 = mi_8x8; + + set_mi_row_col(&cpi->common, xd, + mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], + mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]); if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { - write_mb_modes_kf(cpi, m, bc); + write_mb_modes_kf(cpi, mi_8x8, bc); #ifdef ENTROPY_STATS active_section = 8; #endif @@ -603,10 +611,9 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, pack_mb_tokens(bc, tok, tok_end); } -static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, +static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end, - int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; @@ -614,20 +621,22 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, int bs = (1 << bsl) / 4; // mode_info step for subsize int n; PARTITION_TYPE partition = PARTITION_NONE; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; + MODE_INFO *m = mi_8x8[0]; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; partition = partition_lookup[bsl][m->mbmi.sb_type]; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_8X8) if (xd->ab_index > 0) return; - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { int pl; - const int idx = check_bsize_coverage(cm, mi_row, mi_col, bsize); + const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols, + mi_row, mi_col); set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); // encode the partition information @@ -645,25 +654,26 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); break; case PARTITION_HORZ: - write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); *(get_sb_index(xd, subsize)) = 1; if ((mi_row + bs) < cm->mi_rows) - write_modes_b(cpi, m + bs * mis, bc, tok, tok_end, mi_row + bs, mi_col); + write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs, + mi_col); break; case PARTITION_VERT: - write_modes_b(cpi, m, bc, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col); *(get_sb_index(xd, subsize)) = 1; if ((mi_col + bs) < cm->mi_cols) - write_modes_b(cpi, m + bs, bc, tok, tok_end, mi_row, mi_col + bs); 
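write_modes_sb() here, like its decoder twin decode_modes_sb() earlier in the patch, walks the superblock quadtree: PARTITION_NONE emits one block, HORZ and VERT emit up to two halves guarded by a frame-edge check, and SPLIT recurses into four quadrants. A self-contained sketch of that traversal over one 64x64 superblock (8x8 mode-info units), with a toy partition rule in place of the coded decision:

#include <stdio.h>

typedef enum { PART_NONE, PART_HORZ, PART_VERT, PART_SPLIT } part_t;

/* Toy rule; the real coders read/write this decision against
 * context-dependent probabilities (vp9_partition_tree). */
static part_t toy_partition(int bs) {
  return bs == 8 ? PART_SPLIT : bs == 4 ? PART_HORZ : PART_NONE;
}

static void walk(int row, int col, int bs, int rows, int cols) {
  const int h = bs / 2;  /* sub-block step, as with hbs/bs above */
  if (row >= rows || col >= cols)
    return;              /* clip blocks that start past the frame edge */
  switch (toy_partition(bs)) {
    case PART_NONE:
      printf("%dx%d block at (%d,%d)\n", bs, bs, row, col);
      break;
    case PART_HORZ:
      printf("%dx%d block at (%d,%d)\n", bs, h, row, col);
      if (row + h < rows)
        printf("%dx%d block at (%d,%d)\n", bs, h, row + h, col);
      break;
    case PART_VERT:
      printf("%dx%d block at (%d,%d)\n", h, bs, row, col);
      if (col + h < cols)
        printf("%dx%d block at (%d,%d)\n", h, bs, row, col + h);
      break;
    case PART_SPLIT: {
      int n;
      for (n = 0; n < 4; ++n)  /* quadrants in raster order */
        walk(row + (n >> 1) * h, col + (n & 1) * h, h, rows, cols);
      break;
    }
  }
}

int main(void) {
  walk(0, 0, 8, 8, 8);  /* one full superblock, no edge clipping */
  return 0;
}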
+ write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs); break; case PARTITION_SPLIT: for (n = 0; n < 4; n++) { int j = n >> 1, i = n & 0x01; *(get_sb_index(xd, subsize)) = n; - write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end, + write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end, mi_row + j * bs, mi_col + i * bs, subsize); } break; @@ -672,8 +682,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, } // update partition context - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, subsize, bsize); } @@ -681,20 +691,23 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { - VP9_COMMON *const c = &cpi->common; - const int mis = c->mode_info_stride; - MODE_INFO *m, *m_ptr = c->mi; + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mode_info_stride; int mi_row, mi_col; - - m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis; - - for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end; - mi_row += 8, m_ptr += 8 * mis) { - m = m_ptr; - vp9_zero(c->left_seg_context); - for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end; - mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE) - write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64); + MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **m_8x8; + + mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis; + + for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; + mi_row += 8, mi_8x8 += 8 * mis) { + m_8x8 = mi_8x8; + vp9_zero(cm->left_seg_context); + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) { + write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col, + BLOCK_64X64); + } } } @@ -781,94 +794,170 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, vp9_coeff_probs_model *old_frame_coef_probs = cpi->common.fc.coef_probs[tx_size]; vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size]; - int i, j, k, l, t; - int update[2] = {0, 0}; - int savings; - + const vp9_prob upd = VP9_COEF_UPDATE_PROB; const int entropy_nodes_update = UNCONSTRAINED_NODES; + int i, j, k, l, t; + switch (cpi->sf.use_fast_coef_updates) { + case 0: { + /* dry run to see if there is any udpate at all needed */ + int savings = 0; + int update[2] = {0, 0}; + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + for (t = 0; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; + int s; + int u = 0; + + if (l >= 3 && k == 0) + continue; + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); + if (s > 0 && newp != oldp) + u = 1; + if (u) + savings += s - (int)(vp9_cost_zero(upd)); + else + savings -= (int)(vp9_cost_zero(upd)); + update[u]++; + } + } + } + } + } - 
const int tstart = 0; - /* dry run to see if there is any udpate at all needed */ - savings = 0; - for (i = 0; i < BLOCK_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - // int prev_coef_savings[ENTROPY_NODES] = {0}; - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - for (t = tstart; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; - const vp9_prob upd = VP9_COEF_UPDATE_PROB; - int s; - int u = 0; - - if (l >= 3 && k == 0) - continue; - if (t == PIVOT_NODE) - s = vp9_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); - else - s = vp9_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); - if (s > 0 && newp != oldp) - u = 1; - if (u) - savings += s - (int)(vp9_cost_zero(upd)); - else - savings -= (int)(vp9_cost_zero(upd)); - update[u]++; + // printf("Update %d %d, savings %d\n", update[0], update[1], savings); + /* Is coef updated at all */ + if (update[1] == 0 || savings < 0) { + vp9_write_bit(bc, 0); + return; + } + vp9_write_bit(bc, 1); + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + const vp9_prob upd = VP9_COEF_UPDATE_PROB; + int s; + int u = 0; + if (l >= 3 && k == 0) + continue; + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], + *oldp, &newp, upd); + if (s > 0 && newp != *oldp) + u = 1; + vp9_write(bc, u, upd); +#ifdef ENTROPY_STATS + if (!cpi->dummy_packing) + ++tree_update_hist[tx_size][i][j][k][l][t][u]; +#endif + if (u) { + /* send/use new probability */ + vp9_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } + } } } } + return; } - } - // printf("Update %d %d, savings %d\n", update[0], update[1], savings); - /* Is coef updated at all */ - if (update[1] == 0 || savings < 0) { - vp9_write_bit(bc, 0); - return; - } - vp9_write_bit(bc, 1); - for (i = 0; i < BLOCK_TYPES; ++i) { - for (j = 0; j < REF_TYPES; ++j) { - for (k = 0; k < COEF_BANDS; ++k) { - // int prev_coef_savings[ENTROPY_NODES] = {0}; - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - // calc probs and branch cts for this frame only - for (t = tstart; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; - const vp9_prob upd = VP9_COEF_UPDATE_PROB; - int s; - int u = 0; - if (l >= 3 && k == 0) - continue; - if (t == PIVOT_NODE) - s = vp9_prob_diff_update_savings_search_model( - frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); - else - s = vp9_prob_diff_update_savings_search( - frame_branch_ct[i][j][k][l][t], - *oldp, &newp, upd); - if (s > 0 && newp != *oldp) - u = 1; - vp9_write(bc, u, upd); + case 1: + case 2: { + const int prev_coef_contexts_to_update = + (cpi->sf.use_fast_coef_updates == 2 ? + PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS); + const int coef_band_to_update = + (cpi->sf.use_fast_coef_updates == 2 ? 
+ COEF_BANDS >> 1 : COEF_BANDS); + int updates = 0; + int noupdates_before_first = 0; + for (i = 0; i < BLOCK_TYPES; ++i) { + for (j = 0; j < REF_TYPES; ++j) { + for (k = 0; k < COEF_BANDS; ++k) { + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + // calc probs and branch cts for this frame only + for (t = 0; t < entropy_nodes_update; ++t) { + vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + int s; + int u = 0; + if (l >= 3 && k == 0) + continue; + if (l >= prev_coef_contexts_to_update || + k >= coef_band_to_update) { + u = 0; + } else { + if (t == PIVOT_NODE) + s = vp9_prob_diff_update_savings_search_model( + frame_branch_ct[i][j][k][l][0], + old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + else + s = vp9_prob_diff_update_savings_search( + frame_branch_ct[i][j][k][l][t], + *oldp, &newp, upd); + if (s > 0 && newp != *oldp) + u = 1; + } + updates += u; + if (u == 0 && updates == 0) { + noupdates_before_first++; #ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; + if (!cpi->dummy_packing) + ++tree_update_hist[tx_size][i][j][k][l][t][u]; #endif - if (u) { - /* send/use new probability */ - vp9_write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; + continue; + } + if (u == 1 && updates == 1) { + int v; + // first update + vp9_write_bit(bc, 1); + for (v = 0; v < noupdates_before_first; ++v) + vp9_write(bc, 0, upd); + } + vp9_write(bc, u, upd); +#ifdef ENTROPY_STATS + if (!cpi->dummy_packing) + ++tree_update_hist[tx_size][i][j][k][l][t][u]; +#endif + if (u) { + /* send/use new probability */ + vp9_write_prob_diff_update(bc, newp, *oldp); + *oldp = newp; + } + } } } } } + if (updates == 0) { + vp9_write_bit(bc, 0); // no updates + } + return; } + + default: + assert(0); } } @@ -967,7 +1056,7 @@ static void encode_segmentation(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { int i, j; - struct segmentation *seg = &cpi->mb.e_mbd.seg; + struct segmentation *seg = &cpi->common.seg; vp9_wb_write_bit(wb, seg->enabled); if (!seg->enabled) @@ -1047,7 +1136,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { ct_8x8p); for (j = 0; j < TX_SIZES - 3; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], - VP9_MODE_UPDATE_PROB, ct_8x8p[j]); + MODE_UPDATE_PROB, ct_8x8p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { @@ -1055,14 +1144,14 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { ct_16x16p); for (j = 0; j < TX_SIZES - 2; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], - VP9_MODE_UPDATE_PROB, ct_16x16p[j]); + MODE_UPDATE_PROB, ct_16x16p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); for (j = 0; j < TX_SIZES - 1; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], - VP9_MODE_UPDATE_PROB, ct_32x32p[j]); + MODE_UPDATE_PROB, ct_32x32p[j]); } #ifdef MODE_STATS if (!cpi->dummy_packing) @@ -1073,9 +1162,11 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type, struct vp9_write_bit_buffer *wb) { + const int type_to_literal[] = { 1, 0, 2 }; + vp9_wb_write_bit(wb, type == SWITCHABLE); if (type != SWITCHABLE) - vp9_wb_write_literal(wb, type, 2); + vp9_wb_write_literal(wb, type_to_literal[type], 2); } static void fix_mcomp_filter_type(VP9_COMP *cpi) { @@ -1083,19 +1174,19 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { if (cm->mcomp_filter_type == SWITCHABLE) { // Check to see if only 
one of the filters is actually used - int count[VP9_SWITCHABLE_FILTERS]; + int count[SWITCHABLE_FILTERS]; int i, j, c = 0; - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { count[i] = 0; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) + for (j = 0; j <= SWITCHABLE_FILTERS; ++j) count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } if (c == 1) { // Only one filter is used. So set the filter at frame level - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { if (count[i]) { - cm->mcomp_filter_type = vp9_switchable_interp[i]; + cm->mcomp_filter_type = i; break; } } @@ -1127,7 +1218,8 @@ static int get_refresh_mask(VP9_COMP *cpi) { if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { #else - if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && + !cpi->use_svc) { #endif // Preserve the previously existing golden frame and update the frame in // the alt ref slot instead. This is highly specific to the use of @@ -1239,9 +1331,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]]; found = cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; + + // TODO(ivan): This prevents a bug while more than 3 buffers are used. Do it + // in a better way. + if (cpi->use_svc) { + found = 0; + } vp9_wb_write_bit(wb, found); - if (found) + if (found) { break; + } } if (!found) { @@ -1340,7 +1439,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2); - encode_loopfilter(&xd->lf, wb); + encode_loopfilter(&cm->lf, wb); encode_quantization(cm, wb); encode_segmentation(cpi, wb); @@ -1382,7 +1481,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { for (i = 0; i < INTRA_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->intra_inter_count[i]); if (cm->allow_comp_inter_inter) { @@ -1396,7 +1495,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (use_hybrid_pred) for (i = 0; i < COMP_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->comp_inter_count[i]); } } @@ -1404,10 +1503,10 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { for (i = 0; i < REF_CONTEXTS; i++) { vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->single_ref_count[i][0]); vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->single_ref_count[i][1]); } } @@ -1415,7 +1514,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], - VP9_MODE_UPDATE_PROB, + MODE_UPDATE_PROB, cpi->comp_ref_count[i]); update_mbintra_mode_probs(cpi, &header_bc); @@ -1453,7 +1552,7 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { vp9_compute_update_table(); #ifdef ENTROPY_STATS - if (pc->frame_type == INTER_FRAME) + if (cm->frame_type == INTER_FRAME) active_section = 0; else active_section = 7; diff --git 
a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 3e377cf..013047e 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -48,7 +48,11 @@ typedef struct { int comp_pred_diff; int single_pred_diff; int64_t tx_rd_diff[TX_MODES]; - int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + + // motion vector cache for adaptive motion search control in partition + // search loop + int_mv pred_mv[MAX_REF_FRAMES]; // Bit flag for each mode whether it has high error in comparison to others. unsigned int modes_with_high_error; @@ -121,9 +125,9 @@ struct macroblock { int mbmode_cost[MB_MODE_COUNT]; unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV]; int intra_uv_mode_cost[2][MB_MODE_COUNT]; - int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES]; - int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS]; // These define limits to motion vector components to prevent them // from extending outside the UMV borders @@ -144,12 +148,12 @@ struct macroblock { int optimize; // indicate if it is in the rd search loop or encoding process - int rd_search; + int use_lp32x32fdct; int skip_encode; // Used to store sub partition's choices. int fast_ms; - int_mv pred_mv; + int_mv pred_mv[MAX_REF_FRAMES]; int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the @@ -170,10 +174,10 @@ struct macroblock { PICK_MODE_CONTEXT sb64_context; int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; - BLOCK_SIZE_TYPE b_partitioning[4][4][4]; - BLOCK_SIZE_TYPE mb_partitioning[4][4]; - BLOCK_SIZE_TYPE sb_partitioning[4]; - BLOCK_SIZE_TYPE sb64_partitioning; + BLOCK_SIZE b_partitioning[4][4][4]; + BLOCK_SIZE mb_partitioning[4][4]; + BLOCK_SIZE sb_partitioning[4]; + BLOCK_SIZE sb64_partitioning; void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch); diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c index 3112dad..4f4ad04 100644 --- a/libvpx/vp9/encoder/vp9_dct.c +++ b/libvpx/vp9/encoder/vp9_dct.c @@ -1077,6 +1077,44 @@ static void dct32_1d(int *input, int *output, int round) { output[30] = step[30]; output[31] = step[31]; + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. 
+ if (round) { + output[0] = half_round_shift(output[0]); + output[1] = half_round_shift(output[1]); + output[2] = half_round_shift(output[2]); + output[3] = half_round_shift(output[3]); + output[4] = half_round_shift(output[4]); + output[5] = half_round_shift(output[5]); + output[6] = half_round_shift(output[6]); + output[7] = half_round_shift(output[7]); + output[8] = half_round_shift(output[8]); + output[9] = half_round_shift(output[9]); + output[10] = half_round_shift(output[10]); + output[11] = half_round_shift(output[11]); + output[12] = half_round_shift(output[12]); + output[13] = half_round_shift(output[13]); + output[14] = half_round_shift(output[14]); + output[15] = half_round_shift(output[15]); + + output[16] = half_round_shift(output[16]); + output[17] = half_round_shift(output[17]); + output[18] = half_round_shift(output[18]); + output[19] = half_round_shift(output[19]); + output[20] = half_round_shift(output[20]); + output[21] = half_round_shift(output[21]); + output[22] = half_round_shift(output[22]); + output[23] = half_round_shift(output[23]); + output[24] = half_round_shift(output[24]); + output[25] = half_round_shift(output[25]); + output[26] = half_round_shift(output[26]); + output[27] = half_round_shift(output[27]); + output[28] = half_round_shift(output[28]); + output[29] = half_round_shift(output[29]); + output[30] = half_round_shift(output[30]); + output[31] = half_round_shift(output[31]); + } + // Stage 3 step[0] = output[0] + output[(8 - 1)]; step[1] = output[1] + output[(8 - 2)]; @@ -1112,44 +1150,6 @@ static void dct32_1d(int *input, int *output, int round) { step[30] = output[30] + output[25]; step[31] = output[31] + output[24]; - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (round) { - step[0] = half_round_shift(step[0]); - step[1] = half_round_shift(step[1]); - step[2] = half_round_shift(step[2]); - step[3] = half_round_shift(step[3]); - step[4] = half_round_shift(step[4]); - step[5] = half_round_shift(step[5]); - step[6] = half_round_shift(step[6]); - step[7] = half_round_shift(step[7]); - step[8] = half_round_shift(step[8]); - step[9] = half_round_shift(step[9]); - step[10] = half_round_shift(step[10]); - step[11] = half_round_shift(step[11]); - step[12] = half_round_shift(step[12]); - step[13] = half_round_shift(step[13]); - step[14] = half_round_shift(step[14]); - step[15] = half_round_shift(step[15]); - - step[16] = half_round_shift(step[16]); - step[17] = half_round_shift(step[17]); - step[18] = half_round_shift(step[18]); - step[19] = half_round_shift(step[19]); - step[20] = half_round_shift(step[20]); - step[21] = half_round_shift(step[21]); - step[22] = half_round_shift(step[22]); - step[23] = half_round_shift(step[23]); - step[24] = half_round_shift(step[24]); - step[25] = half_round_shift(step[25]); - step[26] = half_round_shift(step[26]); - step[27] = half_round_shift(step[27]); - step[28] = half_round_shift(step[28]); - step[29] = half_round_shift(step[29]); - step[30] = half_round_shift(step[30]); - step[31] = half_round_shift(step[31]); - } - // Stage 4 output[0] = step[0] + step[3]; output[1] = step[1] + step[2]; diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 66eae41..44ab02d 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -8,43 +8,55 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#include "./vpx_config.h" +#include <limits.h> +#include <math.h> +#include <stdio.h> + #include "./vp9_rtcd.h" -#include "vp9/encoder/vp9_encodeframe.h" -#include "vp9/encoder/vp9_encodemb.h" -#include "vp9/encoder/vp9_encodemv.h" +#include "./vpx_config.h" + +#include "vpx_ports/vpx_timer.h" + #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/common/vp9_extend.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" -#include "vp9/common/vp9_quant_common.h" -#include "vp9/encoder/vp9_segmentation.h" -#include "vp9/encoder/vp9_encodeintra.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/common/vp9_extend.h" #include "vp9/common/vp9_findnearmv.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" + +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodeintra.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_tokenize.h" -#include "./vp9_rtcd.h" -#include <stdio.h> -#include <math.h> -#include <limits.h> -#include "vpx_ports/vpx_timer.h" -#include "vp9/common/vp9_pred_common.h" -#include "vp9/common/vp9_mvref_common.h" #define DBG_PRNT_SEGMAP 0 + +static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_8X8, // ONLY_8X8 + TX_16X16, // ONLY_16X16 + TX_32X32, // ONLY_32X32 + TX_32X32, // TX_MODE_SELECT +}; + // #define ENC_DEBUG #ifdef ENC_DEBUG int enc_debug = 0; #endif static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize); + int mi_row, int mi_col, BLOCK_SIZE bsize); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -53,7 +65,10 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); * This also avoids the need for divide by zero checks in * vp9_activity_masking(). */ -#define VP9_ACTIVITY_AVG_MIN (64) +#define ACTIVITY_AVG_MIN (64) + +/* Motion vector component magnitude threshold for defining fast motion. */ +#define FAST_MOTION_MV_THRESH (24) /* This is used as a reference when computing the source variance for the * purposes of activity masking. @@ -71,13 +86,14 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128 }; -static unsigned int get_sb_variance(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bs) { +static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { unsigned int var, sse; var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride, VP9_VAR_OFFS, 0, &sse); - return var >> num_pels_log2_lookup[bs]; + return (var + (1 << (num_pels_log2_lookup[bs] - 1))) >> + num_pels_log2_lookup[bs]; } // Original activity measure from Tim T's code. @@ -103,31 +119,29 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) { } // Stub for alternative experimental activity measures. 
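One behavioral fix worth calling out in the hunk above: get_sb_variance becomes get_sby_perpixel_variance and now adds half the divisor before the shift, so the per-pixel variance is rounded to nearest instead of truncated. A minimal standalone sketch of the difference (my own names, not libvpx code):

    #include <stdio.h>

    /* Truncating normalization: plain right shift by log2(pixel count). */
    static unsigned per_pixel_truncated(unsigned var, int log2_pels) {
      return var >> log2_pels;
    }

    /* Round-to-nearest: bias by half the divisor first, as the new code does. */
    static unsigned per_pixel_rounded(unsigned var, int log2_pels) {
      return (var + (1u << (log2_pels - 1))) >> log2_pels;
    }

    int main(void) {
      const int log2_pels = 12;   /* a 64x64 block has 2^12 pixels */
      const unsigned var = 6144;  /* exactly 1.5 per pixel */
      printf("truncated %u, rounded %u\n",
             per_pixel_truncated(var, log2_pels),   /* 1 */
             per_pixel_rounded(var, log2_pels));    /* 2 */
      return 0;
    }

The bias matters because these per-pixel values feed small-integer thresholds in the partition logic, where truncation systematically underestimates.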
-static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, - int use_dc_pred) { - return vp9_encode_intra(cpi, x, use_dc_pred); +static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) { + return vp9_encode_intra(x, use_dc_pred); } DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function #define ALT_ACT_MEASURE 1 -static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, - int mb_row, int mb_col) { +static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { unsigned int mb_activity; if (ALT_ACT_MEASURE) { int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); // Or use and alternative. - mb_activity = alt_activity_measure(cpi, x, use_dc_pred); + mb_activity = alt_activity_measure(x, use_dc_pred); } else { // Original activity measure from Tim T's code. mb_activity = tt_activity_measure(x); } - if (mb_activity < VP9_ACTIVITY_AVG_MIN) - mb_activity = VP9_ACTIVITY_AVG_MIN; + if (mb_activity < ACTIVITY_AVG_MIN) + mb_activity = ACTIVITY_AVG_MIN; return mb_activity; } @@ -175,10 +189,10 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { #else // Simple mean for now cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); -#endif +#endif // ACT_MEDIAN - if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN) - cpi->activity_avg = VP9_ACTIVITY_AVG_MIN; + if (cpi->activity_avg < ACTIVITY_AVG_MIN) + cpi->activity_avg = ACTIVITY_AVG_MIN; // Experimental code: return fixed value normalized for several clips if (ALT_ACT_MEASURE) @@ -240,7 +254,7 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { #endif } -#endif +#endif // USE_ACT_INDEX // Loop through all MBs. 
Note activity of each, average activity and // calculate a normalized activity for each @@ -277,7 +291,7 @@ static void build_activity_map(VP9_COMP *cpi) { #endif // measure activity - mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col); + mb_activity = mb_activity_measure(x, mb_row, mb_col); // Keep frame sum activity_sum += mb_activity; @@ -331,15 +345,17 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { } static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE_TYPE bsize, int output_enabled) { + BLOCK_SIZE bsize, int output_enabled) { int i, x_idx, y; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO * const mbmi = &xd->this_mi->mbmi; + MODE_INFO *mi_addr = xd->this_mi; int mb_mode_index = ctx->best_mode_index; - const int mis = cpi->common.mode_info_stride; + const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -349,17 +365,16 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES); assert(mi->mbmi.sb_type == bsize); + *mi_addr = *mi; + // Restore the coding context of the MB to that that was in place // when the mode was picked for it - for (y = 0; y < mi_height; y++) { - for (x_idx = 0; x_idx < mi_width; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx - && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) { - MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis; - *mi_addr = *mi; - } - } - } + for (y = 0; y < mi_height; y++) + for (x_idx = 0; x_idx < mi_width; x_idx++) + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx + && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) + xd->mi_8x8[x_idx + y * mis] = mi_addr; + // FIXME(rbultje) I'm pretty sure this should go to the end of this block // (i.e. 
after the output_enabled) if (bsize < BLOCK_32X32) { @@ -378,12 +393,12 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, if (!output_enabled) return; - if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < TX_MODES; i++) cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (cpi->common.frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME) { // Restore the coding modes to that held in the coding context // if (mb_mode == I4X4_PRED) // for (i = 0; i < 16; i++) @@ -401,7 +416,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/, THR_D153_PRED /*D153_PRED*/, - THR_D27_PRED /*D27_PRED*/, + THR_D207_PRED /*D207_PRED*/, THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/, THR_B_PRED /*I4X4_PRED*/, @@ -412,7 +427,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, // Note how often each mode chosen as best cpi->mode_chosen_counts[mb_mode_index]++; if (is_inter_block(mbmi) - && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { + && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { int_mv best_mv, best_second_mv; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; @@ -427,29 +442,17 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv); } - if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) { - int i, j; - for (j = 0; j < mi_height; ++j) - for (i = 0; i < mi_width; ++i) - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i - && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j) - xd->mode_info_context[mis * j + i].mbmi = *mbmi; - } - - if (cpi->common.mcomp_filter_type == SWITCHABLE - && is_inter_mode(mbmi->mode)) { - ++cpi->common.counts.switchable_interp[ - vp9_get_pred_context_switchable_interp(xd)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; } cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= SWITCHABLE_FILTERS; i++) cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; - } } } @@ -469,10 +472,10 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, } static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCK * const x = &cpi->mb; - VP9_COMMON * const cm = &cpi->common; - MACROBLOCKD * const xd = &x->e_mbd; + BLOCK_SIZE bsize) { + MACROBLOCK *const x = &cpi->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; const int idx_str = xd->mode_info_stride * mi_row + mi_col; @@ -481,18 +484,9 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int mb_row = mi_row >> 1; const int mb_col = mi_col >> 1; const int idx_map = mb_row * cm->mb_cols + mb_col; - const struct segmentation *const seg = &xd->seg; - int i; + const struct segmentation *const seg = &cm->seg; - // entropy context structures - for (i = 0; i < MAX_MB_PLANE; i++) { - 
xd->plane[i].above_context = cm->above_context[i] - + (mi_col * 2 >> xd->plane[i].subsampling_x); - xd->plane[i].left_context = cm->left_context[i] - + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); - } - - // partition contexts + set_skip_context(cm, xd, mi_row, mi_col); set_partition_seg_context(cm, xd, mi_row, mi_col); // Activity map pointer @@ -501,23 +495,28 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, /* pointers to mode info contexts */ x->partition_info = x->pi + idx_str; - xd->mode_info_context = cm->mi + idx_str; - mbmi = &xd->mode_info_context->mbmi; + + xd->mi_8x8 = cm->mi_grid_visible + idx_str; + xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; + // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL; + xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; + + xd->this_mi = + xd->mi_8x8[0] = cm->mi + idx_str; + + mbmi = &xd->this_mi->mbmi; // Set up destination pointers setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); - /* Set up limit values for MV components to prevent them from - * extending beyond the UMV borders assuming 16x16 block size */ - x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE - + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND)); - x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE - + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND)); + // Set up limit values for MV components + // mv beyond the range do not produce new/different prediction block + x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND); + x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND); + x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND; + x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND; // Set up distance of MB to edge of frame in 1/8th pel units assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); @@ -564,25 +563,33 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, int *totalrate, int64_t *totaldist, - BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - x->rd_search = 1; + // Use the lower precision, but faster, 32x32 fdct for mode selection. + x->use_lp32x32fdct = 1; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. 
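The replacement motion-vector limits above are tighter than the old VP9BORDERINPIXELS-based bounds: once a displacement pushes the whole block, plus the interpolation filter's reach, outside the frame, every further displacement yields the same fully extended-border prediction, so searching beyond that point is wasted work. A hedged sketch of the bound computation with simplified constants (MI_SIZE of 8 pixels matches the codec; the filter reach here is an assumption):

    #include <stdio.h>

    #define MI_SIZE 8        /* pixels per mode-info unit */
    #define INTERP_EXTEND 4  /* assumed subpel filter reach in pixels */

    typedef struct { int row_min, row_max, col_min, col_max; } mv_limits;

    static mv_limits mv_search_limits(int mi_row, int mi_col,
                                      int mi_height, int mi_width,
                                      int frame_mi_rows, int frame_mi_cols) {
      mv_limits lim;
      /* Up/left: moving by more than the block's own extent (plus filter
       * reach) puts every referenced pixel above/left of the frame. */
      lim.row_min = -((mi_row + mi_height) * MI_SIZE + INTERP_EXTEND);
      lim.col_min = -((mi_col + mi_width) * MI_SIZE + INTERP_EXTEND);
      /* Down/right: symmetric reasoning against the far frame edges. */
      lim.row_max = (frame_mi_rows - mi_row) * MI_SIZE + INTERP_EXTEND;
      lim.col_max = (frame_mi_cols - mi_col) * MI_SIZE + INTERP_EXTEND;
      return lim;
    }

    int main(void) {
      /* 64x64 block (8x8 MI units) at the origin of a 1088x1920 frame. */
      mv_limits lim = mv_search_limits(0, 0, 8, 8, 136, 240);
      printf("rows [%d, %d]  cols [%d, %d]\n",
             lim.row_min, lim.row_max, lim.col_min, lim.col_max);
      return 0;
    }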
- if (xd->ab_index != 0) + if (xd->ab_index != 0) { + *totalrate = 0; + *totaldist = 0; return; + } } set_offsets(cpi, mi_row, mi_col, bsize); - xd->mode_info_context->mbmi.sb_type = bsize; + xd->this_mi->mbmi.sb_type = bsize; + + // Set to zero to make sure we do not use the previous encoded frame stats + xd->this_mi->mbmi.skip_coeff = 0; + + x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); - x->source_variance = get_sb_variance(cpi, x, bsize); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); @@ -600,38 +607,39 @@ static void update_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = xd->mode_info_context; + MODE_INFO *mi = xd->this_mi; MB_MODE_INFO *const mbmi = &mi->mbmi; if (cm->frame_type != KEY_FRAME) { - const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id, + const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) - cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)][mbmi - ->ref_frame[0] > INTRA_FRAME]++; + cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)] + [is_inter_block(mbmi)]++; // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. - if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) { + if (is_inter_block(mbmi) && !seg_ref_active) { if (cm->comp_pred_mode == HYBRID_PREDICTION) cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)] - [mbmi->ref_frame[1] > INTRA_FRAME]++; + [has_second_ref(mbmi)]++; - if (mbmi->ref_frame[1] > INTRA_FRAME) { - cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)][mbmi - ->ref_frame[0] == GOLDEN_FRAME]++; + if (has_second_ref(mbmi)) { + cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)] + [mbmi->ref_frame[0] == GOLDEN_FRAME]++; } else { - cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)] - [0][mbmi->ref_frame[0] != LAST_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)][0] + [mbmi->ref_frame[0] != LAST_FRAME]++; if (mbmi->ref_frame[0] != LAST_FRAME) cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1] - [mbmi->ref_frame[0] != GOLDEN_FRAME]++; + [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } + // Count of last ref frame 0,0 usage - if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame[0] == LAST_FRAME)) + if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME) cpi->inter_zz_count++; } } @@ -639,9 +647,8 @@ static void update_stats(VP9_COMP *cpi) { // TODO(jingning): the variables used here are little complicated. need further // refactoring on organizing the temporary buffers, when recursive // partition down to 4x4 block size is enabled. 
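The update_stats rewrite above is largely cosmetic (the is_inter_block/has_second_ref helpers replace raw ref_frame comparisons), but the underlying pattern is worth spelling out: every binary coding decision is tallied under the prediction context it was made in, and those counts drive the backward probability adaptation seen elsewhere in this patch. A toy version of the pattern, with invented names:

    #include <stdio.h>

    #define NUM_CONTEXTS 4

    static unsigned counts[NUM_CONTEXTS][2];  /* [context][bit] */

    static void count_event(int ctx, int bit) {
      ++counts[ctx][bit];
    }

    /* 8-bit probability that the next bit in this context is 0,
     * clamped away from the impossible endpoints. */
    static int get_prob(int ctx) {
      const unsigned zeros = counts[ctx][0];
      const unsigned total = zeros + counts[ctx][1];
      int p = total ? (int)((256u * zeros) / total) : 128;
      if (p < 1) p = 1;
      if (p > 255) p = 255;
      return p;
    }

    int main(void) {
      count_event(2, 0);
      count_event(2, 0);
      count_event(2, 1);
      printf("p(zero | ctx 2) = %d/256\n", get_prob(2));  /* 170/256 */
      return 0;
    }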
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD * const xd = &x->e_mbd; +static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; switch (bsize) { case BLOCK_64X64: @@ -676,9 +683,8 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, } } -static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *xd = &x->e_mbd; +static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &x->e_mbd; switch (bsize) { case BLOCK_64X64: return &x->sb64_partitioning; @@ -698,7 +704,7 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -729,7 +735,7 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { const VP9_COMMON *const cm = &cpi->common; const MACROBLOCK *const x = &cpi->mb; const MACROBLOCKD *const xd = &x->e_mbd; @@ -760,7 +766,7 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, } static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { + int output_enabled, BLOCK_SIZE bsize, int sub_index) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; @@ -769,9 +775,9 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, return; if (sub_index != -1) - *(get_sb_index(xd, bsize)) = sub_index; + *get_sb_index(xd, bsize) = sub_index; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. 
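save_context and restore_context, whose signatures are updated above, implement the snapshot/rollback idiom the whole partition search depends on: trial encodes mutate the entropy and partition contexts, so each candidate must be scored from, and rolled back to, the same starting state. A compact sketch of the idiom (hypothetical types; the real code snapshots per-plane ENTROPY_CONTEXT rows and PARTITION_CONTEXT arrays):

    #include <stdio.h>
    #include <string.h>

    #define CTX_SIZE 16

    typedef struct {
      unsigned char above[CTX_SIZE];
      unsigned char left[CTX_SIZE];
    } coding_ctx;

    static void save_ctx(const coding_ctx *live, coding_ctx *snap) {
      memcpy(snap, live, sizeof(*snap));
    }

    static void restore_ctx(coding_ctx *live, const coding_ctx *snap) {
      memcpy(live, snap, sizeof(*live));
    }

    int main(void) {
      coding_ctx live = {{0}, {0}}, snap;
      save_ctx(&live, &snap);
      live.above[0] = 7;          /* trial encode mutates state */
      restore_ctx(&live, &snap);  /* next candidate starts clean */
      printf("above[0] after rollback: %d\n", live.above[0]);  /* 0 */
      return 0;
    }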
if (xd->ab_index > 0) @@ -790,22 +796,22 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, } static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE_TYPE bsize) { + int output_enabled, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; + BLOCK_SIZE c1 = BLOCK_8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; - int UNINITIALIZED_IS_SAFE(pl); + int pl = 0; PARTITION_TYPE partition; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; int i; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; c1 = BLOCK_4X4; - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize >= BLOCK_8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); c1 = *(get_sb_partitioning(x, bsize)); @@ -814,7 +820,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, switch (partition) { case PARTITION_NONE: - if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) + if (output_enabled && bsize >= BLOCK_8X8) cpi->partition_count[pl][PARTITION_NONE]++; encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); break; @@ -839,7 +845,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, for (i = 0; i < 4; i++) { const int x_idx = i & 1, y_idx = i >> 1; - *(get_sb_index(xd, subsize)) = i; + *get_sb_index(xd, subsize) = i; encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, output_enabled, subsize); } @@ -849,52 +855,114 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, break; } - if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) { + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } } -static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, - BLOCK_SIZE_TYPE bsize) { +// Check to see if the given partition size is allowed for a specified number +// of 8x8 block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, + int rows_left, int cols_left, + int *bh, int *bw) { + if ((rows_left <= 0) || (cols_left <= 0)) { + return MIN(bsize, BLOCK_8X8); + } else { + for (; bsize > 0; --bsize) { + *bh = num_8x8_blocks_high_lookup[bsize]; + *bw = num_8x8_blocks_wide_lookup[bsize]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return bsize; +} + +// This function attempts to set all mode info entries in a given SB64 +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. 
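find_partition_size, added above, walks down from the requested size until a block fits in the 8x8 rows and columns still inside the image; set_partitioning, which follows, leans on it for partial SB64s at the bottom and right borders. A self-contained sketch of the fitting logic with a simplified size enum (ordered small to large, mirroring how the lookup tables are indexed):

    #include <stdio.h>

    typedef enum { B8X8, B16X16, B32X32, B64X64, NUM_BSIZES } bsize_t;

    static const int blocks_high[NUM_BSIZES] = { 1, 2, 4, 8 };  /* 8x8 units */
    static const int blocks_wide[NUM_BSIZES] = { 1, 2, 4, 8 };

    static bsize_t fit_partition(bsize_t wanted, int rows_left, int cols_left) {
      bsize_t bs = wanted;
      if (rows_left <= 0 || cols_left <= 0)
        return B8X8;               /* degenerate edge: smallest size */
      while (bs > B8X8 &&
             (blocks_high[bs] > rows_left || blocks_wide[bs] > cols_left))
        --bs;                      /* step down until the block fits */
      return bs;
    }

    int main(void) {
      /* Only 3 rows and 5 columns of 8x8 units remain at the border. */
      printf("fitted size index: %d\n",
             fit_partition(B64X64, 3, 5));  /* B16X16, index 1 */
      return 0;
    }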
+static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE bsize = cpi->sf.always_this_block_size; const int mis = cm->mode_info_stride; + int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row; + int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col; int block_row, block_col; - for (block_row = 0; block_row < 8; ++block_row) { - for (block_col = 0; block_col < 8; ++block_col) { - m[block_row * mis + block_col].mbmi.sb_type = bsize; + MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col; + int bh = num_8x8_blocks_high_lookup[bsize]; + int bw = num_8x8_blocks_wide_lookup[bsize]; + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // Apply the requested partition size to the SB64 if it is all "in image" + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + int index = block_row * mis + block_col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = bsize; + } + } + } else { + // Else this is a partial SB64. + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + int index = block_row * mis + block_col; + // Find a partition size that fits + bsize = find_partition_size(cpi->sf.always_this_block_size, + (row8x8_remaining - block_row), + (col8x8_remaining - block_col), &bh, &bw); + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = bsize; + } } } } -static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) { + +static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, + MODE_INFO **prev_mi_8x8) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; int block_row, block_col; + for (block_row = 0; block_row < 8; ++block_row) { for (block_col = 0; block_col < 8; ++block_col) { - m[block_row * mis + block_col].mbmi.sb_type = - p[block_row * mis + block_col].mbmi.sb_type; + MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col]; + BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; + int offset; + + if (prev_mi) { + offset = prev_mi - cm->prev_mi; + mi_8x8[block_row * mis + block_col] = cm->mi + offset; + mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type; + } } } } -static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m, - BLOCK_SIZE_TYPE bsize, int mis, int mi_row, +static void set_block_size(VP9_COMMON * const cm, MODE_INFO **mi_8x8, + BLOCK_SIZE bsize, int mis, int mi_row, int mi_col) { - int row, col; - int bwl = b_width_log2(bsize); - int bhl = b_height_log2(bsize); - int bsl = (bwl > bhl ? bwl : bhl); - - int bs = (1 << bsl) / 2; // Block size in units of 8 pels. 
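The other structural theme in these hunks: the per-8x8 grid now holds MODE_INFO pointers (mi_8x8) instead of inline structs, so set_partitioning and set_block_size can make every cell covered by a block alias one shared descriptor. A toy illustration of why the pointer grid makes block-level updates a single write (simplified types, not the libvpx layout):

    #include <stdio.h>

    typedef struct { int sb_type; } mode_info;

    #define GRID 8  /* one 64x64 superblock = 8x8 grid of 8x8 units */

    int main(void) {
      static mode_info storage[GRID * GRID];  /* backing store per 8x8 unit */
      mode_info *grid[GRID * GRID];
      mode_info *block = &storage[0];         /* top-left descriptor */
      int r, c;

      /* Point a 4x4-unit (32x32 pixel) region at the shared descriptor. */
      block->sb_type = 32;
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          grid[r * GRID + c] = block;

      block->sb_type = 16;  /* one write re-types the whole region */
      printf("cell (3,3) sees sb_type %d\n", grid[3 * GRID + 3]->sb_type);
      return 0;
    }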
- MODE_INFO *m2 = m + mi_row * mis + mi_col; - for (row = 0; row < bs; row++) { - for (col = 0; col < bs; col++) { - if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols) - continue; - m2[row * mis + col].mbmi.sb_type = bsize; - } - } + int r, c; + const int bs = MAX(num_8x8_blocks_wide_lookup[bsize], + num_8x8_blocks_high_lookup[bsize]); + const int idx_str = mis * mi_row + mi_col; + MODE_INFO **const mi2 = &mi_8x8[idx_str]; + + mi2[0] = cm->mi + idx_str; + mi2[0]->mbmi.sb_type = bsize; + + for (r = 0; r < bs; r++) + for (c = 0; c < bs; c++) + if (mi_row + r < cm->mi_rows && mi_col + c < cm->mi_cols) + mi2[r * mis + c] = mi2[0]; } typedef struct { @@ -931,9 +999,9 @@ typedef enum { V64X64, } TREE_LEVEL; -static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) { +static void tree_to_node(void *data, BLOCK_SIZE bsize, vt_node *node) { int i; - switch (block_size) { + switch (bsize) { case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; node->vt = &vt->vt; @@ -990,9 +1058,9 @@ void sum_2_variances(var *r, var *a, var*b) { a->sum_error + b->sum_error, a->count + b->count); } -static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { +static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { vt_node node; - tree_to_node(data, block_size, &node); + tree_to_node(data, bsize, &node); sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]); sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]); sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]); @@ -1002,7 +1070,7 @@ static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { #if PERFORM_RANDOM_PARTITIONING static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, - BLOCK_SIZE_TYPE block_size, int mi_row, + BLOCK_SIZE block_size, int mi_row, int mi_col, int mi_size) { VP9_COMMON * const cm = &cpi->common; vt_node vt; @@ -1038,30 +1106,30 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, return 0; } -#else +#else // !PERFORM_RANDOM_PARTITIONING -static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, - BLOCK_SIZE_TYPE block_size, int mi_row, +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO **m, + BLOCK_SIZE bsize, int mi_row, int mi_col, int mi_size) { VP9_COMMON * const cm = &cpi->common; vt_node vt; const int mis = cm->mode_info_stride; int64_t threshold = 50 * cpi->common.base_qindex; - tree_to_node(data, block_size, &vt); + tree_to_node(data, bsize, &vt); // split none is available only if we have more than half a block size // in width and height inside the visible image if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows && vt.vt->none.variance < threshold) { - set_block_size(cm, m, block_size, mis, mi_row, mi_col); + set_block_size(cm, m, bsize, mis, mi_row, mi_col); return 1; } // vertical split is available on all but the bottom border if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold && vt.vt->vert[1].variance < threshold) { - set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + set_block_size(cm, m, get_subsize(bsize, PARTITION_VERT), mis, mi_row, mi_col); return 1; } @@ -1069,17 +1137,17 @@ static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, // horizontal split is available on all but the right border if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold && vt.vt->horz[1].variance < threshold) { - set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), 
mis, mi_row, + set_block_size(cm, m, get_subsize(bsize, PARTITION_HORZ), mis, mi_row, mi_col); return 1; } return 0; } -#endif +#endif // PERFORM_RANDOM_PARTITIONING -static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, - int mi_col) { +static void choose_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, + int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK *x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -1095,7 +1163,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int pixels_wide = 64, pixels_high = 64; vp9_zero(vt); - set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); + set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); @@ -1122,13 +1190,16 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, &xd->scale_factor[0]); setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, &xd->scale_factor[1]); - xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; - xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; - vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]], + + xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME; + xd->this_mi->mbmi.sb_type = BLOCK_64X64; + vp9_find_best_ref_mvs(xd, + mi_8x8[0]->mbmi.ref_mvs[mi_8x8[0]->mbmi.ref_frame[0]], &nearest_mv, &near_mv); - xd->mode_info_context->mbmi.mv[0] = nearest_mv; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64); + xd->this_mi->mbmi.mv[0] = nearest_mv; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64); + d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; } @@ -1165,24 +1236,24 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. 
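set_vt_partitioning above turns the variance tree into early partition decisions: if the whole block's variance sits under a threshold scaled by the quantizer (50 * base_qindex in the hunk), code it unsplit; otherwise see whether a single vertical or horizontal split gets both halves under the threshold; otherwise recurse into quadrants. A sketch of just that decision, with the in-image border checks omitted:

    #include <stdio.h>

    typedef struct {
      long none;     /* variance of the whole block        */
      long vert[2];  /* variances of the two half-blocks   */
      long horz[2];  /* for the vertical/horizontal splits */
    } block_var;

    typedef enum { KEEP_WHOLE, SPLIT_VERT, SPLIT_HORZ, RECURSE } vt_choice;

    static vt_choice choose(const block_var *v, int base_qindex) {
      const long threshold = 50L * base_qindex;
      if (v->none < threshold)
        return KEEP_WHOLE;
      if (v->vert[0] < threshold && v->vert[1] < threshold)
        return SPLIT_VERT;
      if (v->horz[0] < threshold && v->horz[1] < threshold)
        return SPLIT_HORZ;
      return RECURSE;  /* fall through to the four quadrants */
    }

    int main(void) {
      block_var v = { 9000, { 2000, 2500 }, { 7000, 300 } };
      printf("choice: %d\n", choose(&v, 60));  /* 3000 -> SPLIT_VERT (1) */
      return 0;
    }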
- if (!set_vt_partitioning(cpi, &vt, m, BLOCK_64X64, mi_row, mi_col, + if (!set_vt_partitioning(cpi, &vt, mi_8x8, BLOCK_64X64, mi_row, mi_col, 4)) { for (i = 0; i < 4; ++i) { const int x32_idx = ((i & 1) << 2); const int y32_idx = ((i >> 1) << 2); - if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_32X32, + if (!set_vt_partitioning(cpi, &vt.split[i], mi_8x8, BLOCK_32X32, (mi_row + y32_idx), (mi_col + x32_idx), 2)) { for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); - if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m, + if (!set_vt_partitioning(cpi, &vt.split[i].split[j], mi_8x8, BLOCK_16X16, (mi_row + y32_idx + y16_idx), (mi_col + x32_idx + x16_idx), 1)) { for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - set_block_size(cm, m, BLOCK_8X8, mis, + set_block_size(cm, mi_8x8, BLOCK_8X8, mis, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx)); } @@ -1193,9 +1264,10 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, } } -static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, - int *rate, int64_t *dist, int do_recon) { +static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist, + int do_recon) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -1208,7 +1280,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int bss = (1 << bsl) / 4; int i, pl; PARTITION_TYPE partition = PARTITION_NONE; - BLOCK_SIZE_TYPE subsize; + BLOCK_SIZE subsize; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; int last_part_rate = INT_MAX; @@ -1219,9 +1291,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int64_t none_dist = INT_MAX; int chosen_rate = INT_MAX; int64_t chosen_dist = INT_MAX; - BLOCK_SIZE_TYPE sub_subsize = BLOCK_4X4; + BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 0; - BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type; + BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -1230,7 +1302,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, subsize = get_subsize(bsize, partition); - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. if (xd->ab_index != 0) { @@ -1244,17 +1316,17 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); x->fast_ms = 0; - x->pred_mv.as_int = 0; x->subblock_ref = 0; if (cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. 
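rd_use_partition, continued below, ultimately keeps whichever of the candidates (the previous frame's partitioning, an unsplit PARTITION_NONE, or a full re-split) carries the lowest Lagrangian cost. A sketch of that comparison; the fixed-point shape mirrors the RDCOST uses later in the function, but the shifts here are illustrative rather than the exact macro:

    #include <stdio.h>
    #include <stdint.h>

    /* cost = lambda * rate + distortion, in shared fixed-point units. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

    int main(void) {
      /* Cheaper distortion but higher rate vs. the reverse. */
      int64_t last = rd_cost(100, 0, 5000, 40000);  /* last frame's layout */
      int64_t none = rd_cost(100, 0, 3000, 52000);  /* unsplit block */
      printf("keep %s\n",
             last <= none ? "last partitioning" : "PARTITION_NONE");
      return 0;
    }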
- if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) { + if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { sub_subsize = get_subsize(subsize, PARTITION_SPLIT); splits_below = 1; for (i = 0; i < 4; i++) { int jj = i >> 1, ii = i & 0x01; - if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) { + MODE_INFO * this_mi = mi_8x8[jj * bss * mis + ii * bss]; + if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) { splits_below = 0; } } @@ -1274,7 +1346,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, none_rate += x->partition_cost[pl][PARTITION_NONE]; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - m->mbmi.sb_type = bs_type; + mi_8x8[0]->mbmi.sb_type = bs_type; *(get_sb_partitioning(x, bsize)) = subsize; } } @@ -1285,16 +1357,16 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: - *(get_sb_index(xd, subsize)) = 0; + *get_sb_index(xd, subsize) = 0; pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && - bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) { + bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { int rt = 0; int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *(get_sb_index(xd, subsize)) = 1; + *get_sb_index(xd, subsize) = 1; pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1308,16 +1380,16 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, } break; case PARTITION_VERT: - *(get_sb_index(xd, subsize)) = 0; + *get_sb_index(xd, subsize) = 0; pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, subsize, get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && - bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { + bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { int rt = 0; int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *(get_sb_index(xd, subsize)) = 1; + *get_sb_index(xd, subsize) = 1; pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { @@ -1343,10 +1415,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *(get_sb_index(xd, subsize)) = i; + *get_sb_index(xd, subsize) = i; - rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &rt, &dt, i != 3); + rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, + i != 3); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; last_part_dist = INT_MAX; @@ -1365,10 +1438,10 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, last_part_rate += x->partition_cost[pl][partition]; if (cpi->sf.adjust_partitioning_from_last_frame - && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8 + && partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == 
cm->mi_cols)) { - BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); split_rate = 0; split_dist = 0; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1386,9 +1459,9 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, || (mi_col + x_idx >= cm->mi_cols)) continue; - *(get_sb_index(xd, split_subsize)) = i; - *(get_sb_partitioning(x, bsize)) = split_subsize; - *(get_sb_partitioning(x, split_subsize)) = split_subsize; + *get_sb_index(xd, split_subsize) = i; + *get_sb_partitioning(x, bsize) = split_subsize; + *get_sb_partitioning(x, split_subsize) = split_subsize; save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1427,8 +1500,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // If last_part is better set the partitioning to that... if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { - m->mbmi.sb_type = bsize; - if (bsize >= BLOCK_SIZE_SB8X8) + mi_8x8[0]->mbmi.sb_type = bsize; + if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = subsize; chosen_rate = last_part_rate; chosen_dist = last_part_dist; @@ -1436,7 +1509,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // If none was better set the partitioning to that... if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { - if (bsize >= BLOCK_SIZE_SB8X8) + if (bsize >= BLOCK_8X8) *(get_sb_partitioning(x, bsize)) = bsize; chosen_rate = none_rate; chosen_dist = none_dist; @@ -1446,37 +1519,68 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success. - if ( bsize == BLOCK_SIZE_SB64X64) + if ( bsize == BLOCK_64X64) assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); if (do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); + encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); *rate = chosen_rate; *dist = chosen_dist; } -static BLOCK_SIZE_TYPE min_partition_size[BLOCK_SIZE_TYPES] = - { BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 }; -static BLOCK_SIZE_TYPE max_partition_size[BLOCK_SIZE_TYPES] = - { BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, - BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, - BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 }; +static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 +}; + +static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 +}; +// Look at all the mode_info entries for blocks that are part of this +// partition and find the min and max values for sb_type. +// At the moment this is designed to work on a 64x64 SB but could be +// adjusted to use a size parameter. +// +// The min and max are assumed to have been initialized prior to calling this +// function so repeat calls can accumulate a min and max of more than one sb64. 
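The two lookup tables above give one step of slack below and above an observed size, and the comment describes how the scan accumulates a min and max across neighbouring SB64s; the function itself follows in the next hunk. A sketch of the whole recipe under those assumptions (simplified size enum, invented names):

    #include <stdio.h>

    typedef enum { B8X8, B16X16, B32X32, B64X64, NUM_BSIZES } bsize_t;

    /* One notch of leeway each way, analogous to the tables above. */
    static const bsize_t widen_min[NUM_BSIZES] = { B8X8, B8X8, B16X16, B32X32 };
    static const bsize_t widen_max[NUM_BSIZES] = { B16X16, B32X32, B64X64, B64X64 };

    static void auto_range(const bsize_t *neighbor_sizes, int n,
                           bsize_t *min_bs, bsize_t *max_bs) {
      int i;
      *min_bs = B64X64;  /* start inverted, then tighten over the scan */
      *max_bs = B8X8;
      for (i = 0; i < n; ++i) {
        if (neighbor_sizes[i] < *min_bs) *min_bs = neighbor_sizes[i];
        if (neighbor_sizes[i] > *max_bs) *max_bs = neighbor_sizes[i];
      }
      *min_bs = widen_min[*min_bs];
      *max_bs = widen_max[*max_bs];
    }

    int main(void) {
      bsize_t seen[] = { B16X16, B16X16, B32X32 };  /* neighbours' choices */
      bsize_t lo, hi;
      auto_range(seen, 3, &lo, &hi);
      printf("search size indices %d..%d\n", lo, hi);  /* B8X8..B64X64 */
      return 0;
    }

Restricting the recursive search to this range is where the speed-up comes from; the leeway keeps one atypical neighbour from locking the current block out of a better size.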
+static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, + BLOCK_SIZE * min_block_size, + BLOCK_SIZE * max_block_size ) { + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + int sb_width_in_blocks = MI_BLOCK_SIZE; + int sb_height_in_blocks = MI_BLOCK_SIZE; + int i, j; + int index = 0; + + // Check the sb_type for each block that belongs to this region. + for (i = 0; i < sb_height_in_blocks; ++i) { + for (j = 0; j < sb_width_in_blocks; ++j) { + MODE_INFO * mi = mi_8x8[index+j]; + BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0; + *min_block_size = MIN(*min_block_size, sb_type); + *max_block_size = MAX(*max_block_size, sb_type); + } + index += xd->mode_info_stride; + } +} // Look at neighboring blocks and set a min and max partition size based on // what they chose. -static void rd_auto_partition_range(VP9_COMP *cpi, - BLOCK_SIZE_TYPE * min_block_size, - BLOCK_SIZE_TYPE * max_block_size) { +static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { MACROBLOCKD *const xd = &cpi->mb.e_mbd; - const MODE_INFO *const mi = xd->mode_info_context; - const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; - const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; - const int left_in_image = xd->left_available && left_mbmi->mb_in_image; - const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + MODE_INFO ** mi_8x8 = xd->mi_8x8; + const int left_in_image = xd->left_available && mi_8x8[-1]; + const int above_in_image = xd->up_available && + mi_8x8[-xd->mode_info_stride]; + MODE_INFO ** above_sb64_mi_8x8; + MODE_INFO ** left_sb64_mi_8x8; // Frequency check if (cpi->sf.auto_min_max_partition_count <= 0) { @@ -1484,51 +1588,182 @@ static void rd_auto_partition_range(VP9_COMP *cpi, cpi->sf.auto_min_max_partition_interval; *min_block_size = BLOCK_4X4; *max_block_size = BLOCK_64X64; - return; } else { --cpi->sf.auto_min_max_partition_count; + + // Set default values if no left or above neighbour + if (!left_in_image && !above_in_image) { + *min_block_size = BLOCK_4X4; + *max_block_size = BLOCK_64X64; + } else { + VP9_COMMON *const cm = &cpi->common; + int row8x8_remaining = cm->cur_tile_mi_row_end - row; + int col8x8_remaining = cm->cur_tile_mi_col_end - col; + int bh, bw; + + // Default "min to max" and "max to min" + *min_block_size = BLOCK_64X64; + *max_block_size = BLOCK_4X4; + + // Find the min and max partition sizes used in the left SB64 + if (left_in_image) { + left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE]; + get_sb_partition_size_range(cpi, left_sb64_mi_8x8, + min_block_size, max_block_size); + } + + // Find the min and max partition sizes used in the above SB64 taking + // the values found for left as a starting point. + if (above_in_image) { + above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE]; + get_sb_partition_size_range(cpi, above_sb64_mi_8x8, + min_block_size, max_block_size); + } + + // Give a bit of leaway either side of the observed min and max + *min_block_size = min_partition_size[*min_block_size]; + *max_block_size = max_partition_size[*max_block_size]; + + // Check border cases where max and min from neighbours may not be legal. 
+ *max_block_size = find_partition_size(*max_block_size, + row8x8_remaining, col8x8_remaining, + &bh, &bw); + *min_block_size = MIN(*min_block_size, *max_block_size); + } } +} - // Check for edge cases - if (!left_in_image && !above_in_image) { - *min_block_size = BLOCK_4X4; - *max_block_size = BLOCK_64X64; - } else if (!left_in_image) { - *min_block_size = min_partition_size[above_mbmi->sb_type]; - *max_block_size = max_partition_size[above_mbmi->sb_type]; - } else if (!above_in_image) { - *min_block_size = min_partition_size[left_mbmi->sb_type]; - *max_block_size = max_partition_size[left_mbmi->sb_type]; - } else { - *min_block_size = - min_partition_size[MIN(left_mbmi->sb_type, above_mbmi->sb_type)]; - *max_block_size = - max_partition_size[MAX(left_mbmi->sb_type, above_mbmi->sb_type)]; +static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + // Only use 8x8 result for non HD videos. + // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; + int use_8x8 = 1; + + if (cm->frame_type && !cpi->is_src_frame_alt_ref && + ((use_8x8 && bsize == BLOCK_16X16) || + bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { + int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; + PICK_MODE_CONTEXT *block_context = NULL; + + if (bsize == BLOCK_16X16) { + block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; + } else if (bsize == BLOCK_32X32) { + block_context = x->mb_context[xd->sb_index]; + } else if (bsize == BLOCK_64X64) { + block_context = x->sb32_context; + } + + if (block_context) { + ref0 = block_context[0].mic.mbmi.ref_frame[0]; + ref1 = block_context[1].mic.mbmi.ref_frame[0]; + ref2 = block_context[2].mic.mbmi.ref_frame[0]; + ref3 = block_context[3].mic.mbmi.ref_frame[0]; + } + + // Currently, only consider 4 inter reference frames. + if (ref0 && ref1 && ref2 && ref3) { + int d01, d23, d02, d13; + + // Motion vectors for the four subblocks. + int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; + int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; + int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; + int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; + int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; + int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; + int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; + int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; + + // Adjust sign if ref is alt_ref. + if (cm->ref_frame_sign_bias[ref0]) { + mvr0 *= -1; + mvc0 *= -1; + } + + if (cm->ref_frame_sign_bias[ref1]) { + mvr1 *= -1; + mvc1 *= -1; + } + + if (cm->ref_frame_sign_bias[ref2]) { + mvr2 *= -1; + mvc2 *= -1; + } + + if (cm->ref_frame_sign_bias[ref3]) { + mvr3 *= -1; + mvc3 *= -1; + } + + // Calculate mv distances. + d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); + d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); + d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); + d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); + + if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH && + d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) { + // Set fast motion search level. + x->fast_ms = 1; + + if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && + d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { + // Set fast motion search level. 
+ x->fast_ms = 2; + + if (!d01 && !d23 && !d02 && !d13) { + x->fast_ms = 3; + x->subblock_ref = ref0; + } + } + } + } } } +static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); +} + +static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); +} + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, - int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, + int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - int bsl = b_width_log2(bsize), bs = 1 << bsl; - int ms = bs / 2; + const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; int i, pl; - BLOCK_SIZE_TYPE subsize; - int srate = INT_MAX; - int64_t sdist = INT_MAX; - + BLOCK_SIZE subsize; + int this_rate, sum_rate = 0, best_rate = INT_MAX; + int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX; + int64_t sum_rd = 0; + int do_split = bsize >= BLOCK_8X8; + int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + ms >= cm->mi_rows); + const int force_vert_split = (mi_col + ms >= cm->mi_cols); + + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8; + int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8; + + int partition_split_done = 0; (void) *tp_orig; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. if (xd->ab_index != 0) { @@ -1539,320 +1774,228 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } assert(mi_height_log2(bsize) == mi_width_log2(bsize)); - save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - - // PARTITION_SPLIT - if (!cpi->sf.auto_min_max_partition_size || - bsize >= cpi->sf.min_partition_size) { - if (bsize > BLOCK_SIZE_SB8X8) { - int r4 = 0; - int64_t d4 = 0, sum_rd = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0; - int64_t d = 0; - - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. 
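+ // (Every candidate below is scored with RDCOST(x->rdmult, x->rddiv,
+ // rate, dist) and the cheapest kept; sf.min/max_partition_size bound the
+ // square sizes tried, while the forced horizontal/vertical splits keep
+ // frame-border blocks partitionable down to a legal size.)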
+ if (cpi->sf.auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && + bsize >= cpi->sf.min_partition_size); + partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && + bsize > cpi->sf.min_partition_size) || + force_horz_split); + partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && + bsize > cpi->sf.min_partition_size) || + force_vert_split); + do_split &= bsize > cpi->sf.min_partition_size; + } + if (cpi->sf.use_square_partition_only) { + partition_horz_allowed &= force_horz_split; + partition_vert_allowed &= force_vert_split; + } - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, - &d, i != 3, best_rd - sum_rd); + save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (r == INT_MAX) { - r4 = INT_MAX; - sum_rd = INT64_MAX; - } else { - r4 += r; - d4 += d; - sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); - } - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r4 != INT_MAX && i == 4) { - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - *(get_sb_partitioning(x, bsize)) = subsize; - assert(r4 >= 0); - assert(d4 >= 0); - srate = r4; - sdist = d4; - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + if (cpi->sf.disable_split_var_thresh && partition_none_allowed) { + unsigned int source_variancey; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + source_variancey = get_sby_perpixel_variance(cpi, x, bsize); + if (source_variancey < cpi->sf.disable_split_var_thresh) { + do_split = 0; + if (source_variancey < cpi->sf.disable_split_var_thresh / 2) + do_rect = 0; } } - // Use 4 subblocks' motion estimation results to speed up current - // partition's checking. - x->fast_ms = 0; - x->pred_mv.as_int = 0; - x->subblock_ref = 0; - - if (cpi->sf.using_small_partition_info && - (!cpi->sf.auto_min_max_partition_size || - (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size))) { - // Only use 8x8 result for non HD videos. - // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 
1 : 0; - int use_8x8 = 1; - - if (cm->frame_type && !cpi->is_src_frame_alt_ref && - ((use_8x8 && bsize == BLOCK_16X16) || - bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { - int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; - PICK_MODE_CONTEXT *block_context = NULL; - - if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; - } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[xd->sb_index]; - } else if (bsize == BLOCK_SIZE_SB64X64) { - block_context = x->sb32_context; - } - - if (block_context) { - ref0 = block_context[0].mic.mbmi.ref_frame[0]; - ref1 = block_context[1].mic.mbmi.ref_frame[0]; - ref2 = block_context[2].mic.mbmi.ref_frame[0]; - ref3 = block_context[3].mic.mbmi.ref_frame[0]; + // PARTITION_NONE + if (partition_none_allowed) { + pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize, + get_block_context(x, bsize), best_rd); + if (this_rate != INT_MAX) { + if (bsize >= BLOCK_8X8) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + this_rate += x->partition_cost[pl][PARTITION_NONE]; } + sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { + int64_t stop_thresh = 2048; + + best_rate = this_rate; + best_dist = this_dist; + best_rd = sum_rd; + if (bsize >= BLOCK_8X8) + *(get_sb_partitioning(x, bsize)) = bsize; - // Currently, only consider 4 inter ref frames. - if (ref0 && ref1 && ref2 && ref3) { - int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0, - mvr3 = 0, mvc3 = 0; - int d01, d23, d02, d13; // motion vector distance between 2 blocks - - // Get each subblock's motion vectors. - mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; - mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; - mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; - mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; - mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; - mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; - mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; - mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; - - // Adjust sign if ref is alt_ref - if (cm->ref_frame_sign_bias[ref0]) { - mvr0 *= -1; - mvc0 *= -1; - } - - if (cm->ref_frame_sign_bias[ref1]) { - mvr1 *= -1; - mvc1 *= -1; - } - - if (cm->ref_frame_sign_bias[ref2]) { - mvr2 *= -1; - mvc2 *= -1; - } + // Adjust threshold according to partition size. + stop_thresh >>= 8 - (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize]); - if (cm->ref_frame_sign_bias[ref3]) { - mvr3 *= -1; - mvc3 *= -1; + // If obtained distortion is very small, choose current partition + // and stop splitting. + if (this_dist < stop_thresh) { + do_split = 0; + do_rect = 0; } + } + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - // Calculate mv distances. - d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); - d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); - d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); - d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); - - if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) { - // Set fast motion search level. 
- x->fast_ms = 1; + // store estimated motion vector + if (cpi->sf.adaptive_motion_search) + store_pred_mv(x, get_block_context(x, bsize)); - // Calculate prediction MV - x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2; - x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2; + // PARTITION_SPLIT + sum_rd = 0; + // TODO(jingning): use the motion vectors given by the above search as + // the starting point of motion search in the following partition type check. + if (do_split) { + subsize = get_subsize(bsize, PARTITION_SPLIT); + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + const int x_idx = (i & 1) * ms; + const int y_idx = (i >> 1) * ms; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; - if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && - d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { - // Set fast motion search level. - x->fast_ms = 2; + *get_sb_index(xd, subsize) = i; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rate, &this_dist, i != 3, best_rd - sum_rd); - if (!d01 && !d23 && !d02 && !d13) { - x->fast_ms = 3; - x->subblock_ref = ref0; - } - } - } + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); } } - } - - if (!cpi->sf.auto_min_max_partition_size || - bsize <= cpi->sf.max_partition_size) { - int larger_is_better = 0; - // PARTITION_NONE - if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - int r; - int64_t d; - pick_sb_modes(cpi, mi_row, mi_col, &r, &d, bsize, - get_block_context(x, bsize), best_rd); - if (r != INT_MAX && bsize >= BLOCK_SIZE_SB8X8) { - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_NONE]; - } - - if (r != INT_MAX && - (bsize == BLOCK_SIZE_SB8X8 || - RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, srate, sdist))) { - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r, d)); - srate = r; - sdist = d; - larger_is_better = 1; - if (bsize >= BLOCK_SIZE_SB8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + if (sum_rd < best_rd && i == 4) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { + best_rate = sum_rate; + best_dist = sum_dist; + best_rd = sum_rd; + *(get_sb_partitioning(x, bsize)) = subsize; + } else { + // skip rectangular partition test when larger block size + // gives better rd cost + if (cpi->sf.less_rectangular_check) + do_rect &= !partition_none_allowed; } } + partition_split_done = 1; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - if (bsize == BLOCK_SIZE_SB8X8) { - int r4 = 0; - int64_t d4 = 0, sum_rd = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0; - int64_t d = 0; - - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; - - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, - &d, i != 3, best_rd - sum_rd); + x->fast_ms = 0; + x->subblock_ref = 0; - if (r == INT_MAX) { - r4 = INT_MAX; - sum_rd = INT64_MAX; - } else 
{ - r4 += r; - d4 += d; - sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); - } + if (partition_split_done && + cpi->sf.using_small_partition_info) { + compute_fast_motion_search_level(cpi, bsize); + } + + // PARTITION_HORZ + if (partition_horz_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_HORZ); + *get_sb_index(xd, subsize) = 0; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + + if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *get_sb_index(xd, subsize) = 1; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); } + } + if (sum_rd < best_rd) { set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); - if (r4 != INT_MAX && i == 4) { - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - if (RDCOST(x->rdmult, x->rddiv, r4, d4) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r4; - sdist = d4; - larger_is_better = 0; - *(get_sb_partitioning(x, bsize)) = subsize; - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); - } + sum_rate += x->partition_cost[pl][PARTITION_HORZ]; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { + best_rd = sum_rd; + best_rate = sum_rate; + best_dist = sum_dist; + *(get_sb_partitioning(x, bsize)) = subsize; } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - if (!cpi->sf.use_square_partition_only && - (!cpi->sf.less_rectangular_check ||!larger_is_better)) { - // PARTITION_HORZ - if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { - int r2, r = 0; - int64_t d2, d = 0, h_rd; - subsize = get_subsize(bsize, PARTITION_HORZ); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, - get_block_context(x, subsize), best_rd); - h_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); - - if (r2 != INT_MAX && h_rd < best_rd && - mi_row + (ms >> 1) < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &r, &d, subsize, - get_block_context(x, subsize), best_rd - h_rd); - if (r == INT_MAX) { - r2 = INT_MAX; - } else { - r2 += r; - d2 += d; - } - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (r2 != INT_MAX && RDCOST(x->rdmult, x->rddiv, r2, d2) - < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r2, d2)); - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + // PARTITION_VERT + if (partition_vert_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_VERT); + + 
*get_sb_index(xd, subsize) = 0; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *get_sb_index(xd, subsize) = 1; + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, get_block_context(x, bsize)); + pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); } - - // PARTITION_VERT - if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { - int r2; - int64_t d2, v_rd; - subsize = get_subsize(bsize, PARTITION_VERT); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, - get_block_context(x, subsize), best_rd); - v_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); - if (r2 != INT_MAX && v_rd < best_rd && - mi_col + (ms >> 1) < cm->mi_cols) { - int r = 0; - int64_t d = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &r, &d, subsize, - get_block_context(x, subsize), best_rd - v_rd); - if (r == INT_MAX) { - r2 = INT_MAX; - } else { - r2 += r; - d2 += d; - } - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (r2 != INT_MAX && - RDCOST(x->rdmult, x->rddiv, r2, d2) - < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } + if (sum_rd < best_rd) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + sum_rate += x->partition_cost[pl][PARTITION_VERT]; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { + best_rate = sum_rate; + best_dist = sum_dist; + best_rd = sum_rd; + *(get_sb_partitioning(x, bsize)) = subsize; } } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - *rate = srate; - *dist = sdist; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (srate < INT_MAX && sdist < INT_MAX && do_recon) - encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); + *rate = best_rate; + *dist = best_dist; - if (bsize == BLOCK_SIZE_SB64X64) { + if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) + encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); - assert(srate < INT_MAX); - assert(sdist < INT_MAX); + assert(best_rate < INT_MAX); + assert(best_dist < INT_MAX); } else { assert(tp_orig == *tp); } @@ -1863,7 +2006,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl; + int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; int ms = bs / 2; 
 ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
 PARTITION_CONTEXT sl[8], sa[8];
@@ -1871,7 +2014,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
 int r;
 int64_t d;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+ save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 // Default is no mask (all reference frames allowed).
 cpi->ref_frame_mask = 0;
@@ -1880,17 +2023,17 @@
 if ((mi_row + (ms >> 1) < cm->mi_rows) &&
 (mi_col + (ms >> 1) < cm->mi_cols)) {
 cpi->set_ref_frame_mask = 1;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_SIZE_SB64X64,
- get_block_context(x, BLOCK_SIZE_SB64X64), INT64_MAX);
+ pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64,
+ get_block_context(x, BLOCK_64X64), INT64_MAX);
 set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
+ pl = partition_plane_context(xd, BLOCK_64X64);
 r += x->partition_cost[pl][PARTITION_NONE];
- *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64;
+ *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
 cpi->set_ref_frame_mask = 0;
 }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64);
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
 }
 static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
@@ -1908,12 +2051,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
 int dummy_rate;
 int64_t dummy_dist;
- // Initialize a mask of modes that we will not consider;
- // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden)
- if (cpi->common.frame_type == KEY_FRAME)
- cpi->unused_mode_skip_mask = 0;
- else
- cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00;
+ vpx_memset(cpi->mb.pred_mv, 0, sizeof(cpi->mb.pred_mv));
 if (cpi->sf.reference_masking)
 rd_pick_reference_frame(cpi, mi_row, mi_col);
@@ -1921,18 +2059,18 @@
 if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
 cpi->sf.use_one_partition_size_always ) {
 const int idx_str = cm->mode_info_stride * mi_row + mi_col;
- MODE_INFO *m = cm->mi + idx_str;
- MODE_INFO *p = cm->prev_mi + idx_str;
+ MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+ MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str;
 cpi->mb.source_variance = UINT_MAX;
 if (cpi->sf.use_one_partition_size_always) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
- set_partitioning(cpi, m, cpi->sf.always_this_block_size);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
+ set_partitioning(cpi, mi_8x8, mi_row, mi_col);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
 &dummy_rate, &dummy_dist, 1);
 } else if (cpi->sf.partition_by_variance) {
- choose_partitioning(cpi, cm->mi, mi_row, mi_col);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ choose_partitioning(cpi, cm->mi_grid_visible, mi_row, mi_col);
+ rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
 &dummy_rate, &dummy_dist, 1);
 } else {
 if ((cpi->common.current_video_frame
@@ -1943,26 +2081,28 @@
 || cpi->is_src_frame_alt_ref) {
 // If required set upper and lower partition size limits
 if (cpi->sf.auto_min_max_partition_size) {
- rd_auto_partition_range(cpi,
+ set_offsets(cpi, mi_row, mi_col, 
BLOCK_64X64); + rd_auto_partition_range(cpi, mi_row, mi_col, &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } else { - copy_partitioning(cpi, m, p); - rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + copy_partitioning(cpi, mi_8x8, prev_mi_8x8); + rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1); } } } else { // If required set upper and lower partition size limits if (cpi->sf.auto_min_max_partition_size) { - rd_auto_partition_range(cpi, &cpi->sf.min_partition_size, + set_offsets(cpi, mi_row, mi_col, BLOCK_64X64); + rd_auto_partition_range(cpi, mi_row, mi_col, + &cpi->sf.min_partition_size, &cpi->sf.max_partition_size); } - - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } } @@ -1993,8 +2133,8 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - xd->mode_info_context->mbmi.mode = DC_PRED; - xd->mode_info_context->mbmi.uv_mode = DC_PRED; + xd->this_mi->mbmi.mode = DC_PRED; + xd->this_mi->mbmi.uv_mode = DC_PRED; vp9_zero(cpi->y_mode_count) vp9_zero(cpi->y_uv_mode_count) @@ -2023,7 +2163,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; cpi->mb.optimize = 0; - cpi->mb.e_mbd.lf.filter_level = 0; + cpi->common.lf.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; cpi->common.tx_mode = ONLY_4X4; } else { @@ -2070,8 +2210,14 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cm->counts.switchable_interp); vp9_zero(cpi->txfm_stepdown_count); - xd->mode_info_context = cm->mi; - xd->prev_mode_info_context = cm->prev_mi; + xd->mi_8x8 = cm->mi_grid_visible; + // required for vp9_frame_init_quantizer + xd->this_mi = + xd->mi_8x8[0] = cm->mi; + xd->mic_stream_ptr = cm->mi; + + xd->last_mi = cm->prev_mi; + vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts); @@ -2095,7 +2241,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { build_activity_map(cpi); } - // re-initencode frame context. + // Re-initialize encode frame context. 
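 // (i.e. default the prediction modes back to DC_PRED and zero the y/uv
 // intra mode counts, as in the init_encode_frame_mb_context hunk above)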
init_encode_frame_mb_context(cpi); vp9_zero(cpi->rd_comp_pred_diff); @@ -2164,10 +2310,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { } static int check_dual_ref_flags(VP9_COMP *cpi) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - int ref_flags = cpi->ref_frame_flags; + const int ref_flags = cpi->ref_frame_flags; - if (vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { + if (vp9_segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) { return 0; } else { return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) @@ -2175,12 +2320,12 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { } } -static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) { +static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) { int x, y; for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) { - if (!mi[y * mis + x].mbmi.mb_skip_coeff) + if (!mi_8x8[y * mis + x]->mbmi.skip_coeff) return 0; } } @@ -2188,85 +2333,75 @@ static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) { return 1; } -static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, - TX_SIZE txfm_size) { +static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs, + TX_SIZE tx_size) { int x, y; for (y = 0; y < ymbs; y++) { for (x = 0; x < xmbs; x++) - mi[y * mis + x].mbmi.txfm_size = txfm_size; + mi_8x8[y * mis + x]->mbmi.tx_size = tx_size; } } -static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis, - TX_SIZE txfm_max, int bw, int bh, int mi_row, - int mi_col, BLOCK_SIZE_TYPE bsize) { +static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, + int mis, TX_SIZE max_tx_size, int bw, int bh, + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &mi->mbmi; + MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (mbmi->txfm_size > txfm_max) { - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + if (mbmi->tx_size > max_tx_size) { const int ymbs = MIN(bh, cm->mi_rows - mi_row); const int xmbs = MIN(bw, cm->mi_cols - mi_col); - xd->mode_info_context = mi; - assert(vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi, mis, ymbs, xmbs)); - set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); + assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + get_skip_flag(mi_8x8, mis, ymbs, xmbs)); + set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); } } -static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, - TX_SIZE txfm_max, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { +static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, + TX_SIZE max_tx_size, int mi_row, int mi_col, + BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bwl, bhl; - const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); + int bw, bh; + const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = mi_width_log2(mi->mbmi.sb_type); - bhl = mi_height_log2(mi->mbmi.sb_type); + bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; + bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; - if (bwl == bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row, + if (bw == bs && bh == bs) { + reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, bs, mi_row, + mi_col, bsize); + } else if (bw == bs && 
bh < bs) { + reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, hbs, mi_row, mi_col, bsize); - } else if (bwl == bsl && bhl < bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col, - bsize); - reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs, - mi_row + bs, mi_col, bsize); - } else if (bwl < bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col, - bsize); - reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row, - mi_col + bs, bsize); + reset_skip_txfm_size_b(cpi, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs, + mi_row + hbs, mi_col, bsize); + } else if (bw < bs && bh == bs) { + reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, hbs, bs, mi_row, + mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row, + mi_col + hbs, bsize); + } else { - BLOCK_SIZE_TYPE subsize; + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; - assert(bwl < bsl && bhl < bsl); - if (bsize == BLOCK_64X64) { - subsize = BLOCK_32X32; - } else if (bsize == BLOCK_32X32) { - subsize = BLOCK_16X16; - } else { - assert(bsize == BLOCK_16X16); - subsize = BLOCK_8X8; - } + assert(bw < bs && bh < bs); for (n = 0; n < 4; n++) { - const int y_idx = n >> 1, x_idx = n & 0x01; + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); - reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max, - mi_row + y_idx * bs, mi_col + x_idx * bs, - subsize); + reset_skip_txfm_size_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size, + mi_row + mi_dr, mi_col + mi_dc, subsize); } } } @@ -2275,13 +2410,14 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { VP9_COMMON * const cm = &cpi->common; int mi_row, mi_col; const int mis = cm->mode_info_stride; - MODE_INFO *mi, *mi_ptr = cm->mi; +// MODE_INFO *mi, *mi_ptr = cm->mi; + MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { - mi = mi_ptr; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) { - reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col, - BLOCK_SIZE_SB64X64); + mi_8x8 = mi_ptr; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) { + reset_skip_txfm_size_sb(cpi, mi_8x8, txfm_max, mi_row, mi_col, + BLOCK_64X64); } } } @@ -2334,7 +2470,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this // requires further work in the rd loop. For now the only supported encoder - // side behaviour is where the ALT ref buffer has opposite sign bias to + // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
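 // (With binary sign-bias flags this means LAST and GOLDEN agree while
 // ALTREF differs; any other combination keeps single prediction only.)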
if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == cm->ref_frame_sign_bias[GOLDEN_FRAME]) @@ -2387,27 +2523,26 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_filter_threshes[frame_type][1] > cpi->rd_filter_threshes[frame_type][2] && cpi->rd_filter_threshes[frame_type][1] > - cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { - filter_type = vp9_switchable_interp[1]; + cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { + filter_type = EIGHTTAP_SMOOTH; } else if (cpi->rd_filter_threshes[frame_type][2] > cpi->rd_filter_threshes[frame_type][0] && cpi->rd_filter_threshes[frame_type][2] > - cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { - filter_type = vp9_switchable_interp[2]; + cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { + filter_type = EIGHTTAP_SHARP; } else if (cpi->rd_filter_threshes[frame_type][0] > - cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { - filter_type = vp9_switchable_interp[0]; + cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { + filter_type = EIGHTTAP; } else { filter_type = SWITCHABLE; } - /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */ - cpi->mb.e_mbd.lossless = 0; if (cpi->oxcf.lossless) { cpi->mb.e_mbd.lossless = 1; } + /* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */ select_tx_mode(cpi); cpi->common.comp_pred_mode = pred_type; cpi->common.mcomp_filter_type = filter_type; @@ -2419,7 +2554,7 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= SWITCHABLE_FILTERS; i++) { const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; cpi->rd_filter_threshes[frame_type][i] = (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; @@ -2495,29 +2630,22 @@ void vp9_encode_frame(VP9_COMP *cpi) { } -static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { - const MACROBLOCKD *xd = &x->e_mbd; - const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; - const MB_PREDICTION_MODE uvm = xd->mode_info_context->mbmi.uv_mode; - - ++cpi->y_uv_mode_count[m][uvm]; - if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - const int bsl = MIN(bwl, bhl); - ++cpi->y_mode_count[MIN(bsl, 3)][m]; - } else { +static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) { + const MB_PREDICTION_MODE y_mode = mi->mbmi.mode; + const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; + + ++cpi->y_uv_mode_count[y_mode][uv_mode]; + + if (bsize < BLOCK_8X8) { int idx, idy; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[ - xd->mode_info_context->mbmi.sb_type]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[ - xd->mode_info_context->mbmi.sb_type]; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode; - ++cpi->y_mode_count[0][m]; - } - } + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) + ++cpi->y_mode_count[0][mi->bmi[idy * 2 + idx].as_mode]; + } else { + ++cpi->y_mode_count[size_group_lookup[bsize]][y_mode]; } } @@ -2541,19 +2669,19 @@ static void adjust_act_zbin(VP9_COMP *cpi, 
MACROBLOCK *x) { x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); #endif } - static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { + int mi_row, int mi_col, BLOCK_SIZE bsize) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; - MODE_INFO *mi = xd->mode_info_context; + MODE_INFO **mi_8x8 = xd->mi_8x8; + MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - x->rd_search = 0; + x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH); if (x->skip_encode) @@ -2582,7 +2710,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; else cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; - } else if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { + } else if (mbmi->sb_type < BLOCK_8X8) { cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST; } else { cpi->zbin_mode_boost = MV_ZBIN_BOOST; @@ -2595,13 +2723,11 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_update_zbin_extra(cpi, x); } - if (mbmi->ref_frame[0] == INTRA_FRAME) { - vp9_encode_intra_block_y( - cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - vp9_encode_intra_block_uv( - cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + if (!is_inter_block(mbmi)) { + vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8)); + vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8)); if (output_enabled) - sum_intra_stats(cpi, x); + sum_intra_stats(cpi, mi); } else { int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])]; YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; @@ -2619,44 +2745,37 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, &xd->scale_factor[1]); - vp9_build_inter_predictors_sb( - xd, mi_row, mi_col, - bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize); + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); } - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { - vp9_tokenize_sb(cpi, t, !output_enabled, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + if (!is_inter_block(mbmi)) { + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else if (!x->skip) { - vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - vp9_tokenize_sb(cpi, t, !output_enabled, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { - int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0; - mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff; + int mb_skip_context = xd->left_available ? mi_8x8[-1]->mbmi.skip_coeff : 0; + mb_skip_context += mi_8x8[-mis] ? mi_8x8[-mis]->mbmi.skip_coeff : 0; - mbmi->mb_skip_coeff = 1; + mbmi->skip_coeff = 1; if (output_enabled) cm->counts.mbskip[mb_skip_context][1]++; - vp9_reset_sb_tokens_context( - xd, (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 : bsize); + reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } - // copy skip flag on all mb_mode_info contexts in this SB - // if this was a skip at this txfm size - vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, mi->mbmi.mb_skip_coeff); - if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && - mbmi->sb_type >= BLOCK_SIZE_SB8X8 && + mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && - (mbmi->mb_skip_coeff || - vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) { + (mbmi->skip_coeff || + vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); - update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx); + update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx); } else { int x, y; - TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode; + TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; + assert(sizeof(tx_mode_to_biggest_tx_size) / + sizeof(tx_mode_to_biggest_tx_size[0]) == TX_MODES); // The new intra coding scheme requires no change of transform size if (is_inter_block(&mi->mbmi)) { if (sz == TX_32X32 && bsize < BLOCK_32X32) @@ -2666,18 +2785,15 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (sz == TX_8X8 && bsize < BLOCK_8X8) sz = TX_4X4; } else if (bsize >= BLOCK_8X8) { - sz = mbmi->txfm_size; + sz = mbmi->tx_size; } else { sz = TX_4X4; } - for (y = 0; y < mi_height; y++) { - for (x = 0; x < mi_width; x++) { - if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) { - mi[mis * y + x].mbmi.txfm_size = sz; - } - } - } + for (y = 0; y < mi_height; y++) + for (x = 0; x < mi_width; x++) + if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) + mi_8x8[mis * y + x]->mbmi.tx_size = sz; } } } diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c index edbd2d9..c5e5dff 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.c +++ b/libvpx/vp9/encoder/vp9_encodeintra.c @@ -15,14 +15,14 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodeintra.h" -int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; - (void) cpi; +int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { + MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi; x->skip_encode = 0; mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? - TX_16X16 : TX_8X8) : TX_4X4; - vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); + mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? 
TX_16X16 + : TX_8X8) + : TX_4X4; + vp9_encode_intra_block_y(x, mbmi->sb_type); return vp9_get_mb_ss(x->plane[0].src_diff); } diff --git a/libvpx/vp9/encoder/vp9_encodeintra.h b/libvpx/vp9/encoder/vp9_encodeintra.h index 16ac59e..e217924 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.h +++ b/libvpx/vp9/encoder/vp9_encodeintra.h @@ -13,12 +13,8 @@ #include "vp9/encoder/vp9_onyx_int.h" -int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred); -void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg); -void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb, - BLOCK_SIZE_TYPE bs); -void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb, - BLOCK_SIZE_TYPE bs); +int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred); +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); #endif // VP9_ENCODER_VP9_ENCODEINTRA_H_ diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 40b0a4e..8dd80a5 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -69,7 +69,7 @@ static void inverse_transform_b_16x16_add(int eob, vp9_short_idct16x16_add(dqcoeff, dest, stride); } -static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { +static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -81,18 +81,18 @@ static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { subtract_plane(x, bsize, 0); } -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) { int i; for (i = 1; i < MAX_MB_PLANE; i++) subtract_plane(x, bsize, i); } -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { vp9_subtract_sby(x, bsize); vp9_subtract_sbuv(x, bsize); } @@ -142,37 +142,36 @@ static int trellis_get_coeff_context(const int16_t *scan, } static void optimize_b(MACROBLOCK *mb, - int plane, int block, BLOCK_SIZE_TYPE bsize, + int plane, int block, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, TX_SIZE tx_size) { MACROBLOCKD *const xd = &mb->e_mbd; - const int ref = is_inter_block(&xd->mode_info_context->mbmi); + struct macroblockd_plane *pd = &xd->plane[plane]; + const int ref = is_inter_block(&xd->this_mi->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; - const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, - block, 16); + const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); int16_t *qcoeff_ptr; int16_t *dqcoeff_ptr; - int eob = xd->plane[plane].eobs[block], final_eob, sz = 0; + int eob = pd->eobs[block], final_eob, sz = 0; const int i0 = 0; int rc, x, next, i; int64_t rdmult, rddiv, rd_cost0, rd_cost1; int rate0, rate1, error0, error1, t0, t1; int best, band, pt; - PLANE_TYPE type = xd->plane[plane].plane_type; + PLANE_TYPE type = pd->plane_type; int err_mult = plane_rd_mult[type]; int default_eob; const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; - const int ib = txfrm_block_to_raster_block(xd, bsize, plane, - block, 2 * tx_size); - const 
int16_t *dequant_ptr = xd->plane[plane].dequant; + const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block); + const int16_t *dequant_ptr = pd->dequant; const uint8_t * band_translate; assert((!type && !plane) || (type && plane)); - dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16); - qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); + dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); + qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); switch (tx_size) { default: case TX_4X4: @@ -200,7 +199,7 @@ static void optimize_b(MACROBLOCK *mb, /* Now set up a Viterbi trellis to evaluate alternative roundings. */ rdmult = mb->rdmult * err_mult; - if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) + if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) rdmult = (rdmult * 9) >> 4; rddiv = mb->rddiv; /* Initialize the sentinel node of the trellis. */ @@ -371,59 +370,48 @@ static void optimize_b(MACROBLOCK *mb, *a = *l = (final_eob > 0); } -void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, MACROBLOCK *mb, - struct optimize_ctx *ctx) { - MACROBLOCKD *const xd = &mb->e_mbd; +void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { int x, y; - - // find current entropy context - txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); - - optimize_b(mb, plane, block, bsize, - &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2); -} - -static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - const struct encode_b_args* const args = arg; - vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->x, args->ctx); + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + optimize_b(mb, plane, block, plane_bsize, + &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size); } -void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) { - const struct encode_b_args* const args = arg; +static void optimize_init_b(int plane, BLOCK_SIZE bsize, + struct encode_b_args *args) { const MACROBLOCKD *xd = &args->x->e_mbd; const struct macroblockd_plane* const pd = &xd->plane[plane]; - const int bwl = b_width_log2(bsize) - pd->subsampling_x; - const int bhl = b_height_log2(bsize) - pd->subsampling_y; - const int bw = 1 << bwl, bh = 1 << bhl; - const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->txfm_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; int i; switch (tx_size) { case TX_4X4: vpx_memcpy(args->ctx->ta[plane], pd->above_context, - sizeof(ENTROPY_CONTEXT) * bw); + sizeof(ENTROPY_CONTEXT) * num_4x4_w); vpx_memcpy(args->ctx->tl[plane], pd->left_context, - sizeof(ENTROPY_CONTEXT) * bh); + sizeof(ENTROPY_CONTEXT) * num_4x4_h); break; case TX_8X8: - for (i = 0; i < bw; i += 2) + for (i = 0; i < num_4x4_w; i += 2) args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 2) + for (i = 0; i < num_4x4_h; i += 2) args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i]; break; case TX_16X16: - for (i = 0; i < bw; i += 4) + for (i = 0; i < num_4x4_w; i += 4) args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 4) + for (i = 0; i < num_4x4_h; i += 4) args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i]; break; case TX_32X32: - for (i = 0; i < bw; i += 8) + for (i = 0; i < num_4x4_w; i += 8) args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 8) + for (i = 0; i < num_4x4_h; i += 8) args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i]; break; default: @@ -431,38 +419,19 @@ void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) { } } -void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - optimize_init_b(0, bsize, &arg); - foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg); -} - -void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - int i; - for (i = 1; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); - - foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg); -} - -void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + int16_t *coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int16_t *scan, *iscan; uint16_t *eob = &pd->eobs[block]; - const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl; + const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; const int twl = bwl - tx_size, twmask = (1 << twl) - 1; int xoff, yoff; int16_t *src_diff; @@ -475,7 +444,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, xoff = 32 * (block & twmask); yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - if (x->rd_search) + if (x->use_lp32x32fdct) vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else vp9_short_fdct32x32(src_diff, coeff, bw * 8); @@ -523,29 +492,27 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, } } -static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, + 
TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, - raster_block, + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, + block); + + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, pd->dst.buf, pd->dst.stride); - xform_quant(plane, block, bsize, ss_txfrm_size, arg); + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); if (x->optimize) - vp9_optimize_b(plane, block, bsize, ss_txfrm_size, x, args->ctx); + vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); - if (x->skip_encode) - return; - if (pd->eobs[block] == 0) + if (x->skip_encode || pd->eobs[block] == 0) return; - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; @@ -564,28 +531,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, dst, pd->dst.stride); break; + default: + assert(!"Invalid transform size"); } } -void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL}; - - foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg); -} - -void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct encode_b_args arg = {cm, x, NULL}; - - foreach_transformed_block_uv(xd, bsize, xform_quant, &arg); -} - -void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {x, &ctx}; vp9_subtract_sby(x, bsize); if (x->optimize) @@ -594,25 +548,10 @@ void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); } -void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - - vp9_subtract_sbuv(x, bsize); - if (x->optimize) { - int i; - for (i = 1; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); - } - - foreach_transformed_block_uv(xd, bsize, encode_block, &arg); -} - -void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {x, &ctx}; vp9_subtract_sb(x, bsize); @@ -625,35 +564,32 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { foreach_transformed_block(xd, bsize, encode_block, &arg); } -void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD 
*const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + int16_t *coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int16_t *scan, *iscan; TX_TYPE tx_type; MB_PREDICTION_MODE mode; - const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl; + const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; const int twl = bwl - tx_size, twmask = (1 << twl) - 1; int xoff, yoff; uint8_t *src, *dst; int16_t *src_diff; uint16_t *eob = &pd->eobs[block]; - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); - } + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) + extend_for_intra(xd, plane_bsize, plane, block, tx_size); // if (x->optimize) - // vp9_optimize_b(plane, block, bsize, ss_txfrm_size, - // x, args->ctx); + // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); switch (tx_size) { case TX_32X32: @@ -670,7 +606,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (x->rd_search) + if (x->use_lp32x32fdct) vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else vp9_short_fdct32x32(src_diff, coeff, bw * 8); @@ -699,8 +635,8 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); else x->fwd_txm16x16(src_diff, coeff, bw * 8); - vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) @@ -743,7 +679,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, scan = get_scan_4x4(tx_type); iscan = get_iscan_4x4(tx_type); if (mbmi->sb_type < BLOCK_8X8 && plane == 0) - mode = xd->mode_info_context->bmi[block].as_mode; + mode = xd->this_mi->bmi[block].as_mode; else mode = plane == 0 ? 
mbmi->mode : mbmi->uv_mode; @@ -778,20 +714,18 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, } } -void vp9_encode_intra_block_y(VP9_COMMON *cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { +void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {x, &ctx}; - foreach_transformed_block_in_plane(xd, bsize, 0, - encode_block_intra, &arg); + foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra, + &arg); } -void vp9_encode_intra_block_uv(VP9_COMMON *cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize) { +void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD* const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {cm, x, &ctx}; - foreach_transformed_block_uv(xd, bsize, encode_block_intra, &arg); + struct encode_b_args arg = {x, &ctx}; + foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg); } diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index f647fd9..54e69fd 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -16,8 +16,28 @@ #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_onyxc_int.h" +typedef enum { + RD_DC_PRED = DC_PRED, + RD_V_PRED = V_PRED, + RD_H_PRED = H_PRED, + RD_D45_PRED = D45_PRED, + RD_D135_PRED = D135_PRED, + RD_D117_PRED = D117_PRED, + RD_D153_PRED = D153_PRED, + RD_D207_PRED = D207_PRED, + RD_D63_PRED = D63_PRED, + RD_TM_PRED = TM_PRED, + RD_NEARESTMV = NEARESTMV, + RD_NEARMV = NEARMV, + RD_ZEROMV = ZEROMV, + RD_NEWMV = NEWMV, + RD_I4X4_PRED, + RD_SPLITMV, + RD_MODE_COUNT +} RD_PREDICTION_MODE; + typedef struct { - MB_PREDICTION_MODE mode; + RD_PREDICTION_MODE mode; MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME second_ref_frame; } MODE_DEFINITION; @@ -28,28 +48,22 @@ struct optimize_ctx { }; struct encode_b_args { - VP9_COMMON *cm; MACROBLOCK *x; struct optimize_ctx *ctx; }; -void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, MACROBLOCK *x, - struct optimize_ctx *ctx); -void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_optimize_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize); + +void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); -void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize); -void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg); -void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); #endif // VP9_ENCODER_VP9_ENCODEMB_H_ 
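In the vp9_encodemb.c hunks above, BLOCK_OFFSET loses its third argument: every call site passed the same stride of 16 int16_t coefficients per 4x4 transform block, so the constant can move into the macro itself. A minimal sketch of that change follows; the one-argument definition is assumed here rather than quoted from vp9_blockd.h.

#include <stdint.h>
#include <stdio.h>

/* Assumed post-change form: the 16-coefficient stride of a 4x4 block is
 * baked in, so call sites index by block number only. */
#define BLOCK_OFFSET(x, i) ((x) + (i) * 16)

int main(void) {
  int16_t coeff[64 * 16] = {0};
  int16_t *blk = BLOCK_OFFSET(coeff, 5);        /* block 5's coefficients */
  printf("offset = %d\n", (int)(blk - coeff));  /* prints: offset = 80 */
  return 0;
}

This is why rewrites such as BLOCK_OFFSET(p->coeff, block, 16) become BLOCK_OFFSET(p->coeff, block) throughout the file.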
diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index 1c6fa3a..ed3a2bb 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -20,10 +20,6 @@ extern unsigned int active_section; #endif -#ifdef NMV_STATS -nmv_context_counts tnmvcounts; -#endif - static void encode_mv_component(vp9_writer* w, int comp, const nmv_component* mvcomp, int usehp) { int offset; @@ -159,7 +155,6 @@ static void counts_to_nmv_context( unsigned int (*branch_ct_class0_hp)[2], unsigned int (*branch_ct_hp)[2]) { int i, j, k; - vp9_counts_process(nmv_count, usehp); vp9_tree_probs_from_distribution(vp9_mv_joint_tree, prob->joints, branch_ct_joint, @@ -218,152 +213,6 @@ static void counts_to_nmv_context( } } -#ifdef NMV_STATS -void init_nmvstats() { - vp9_zero(tnmvcounts); -} - -void print_nmvstats() { - nmv_context prob; - unsigned int branch_ct_joint[MV_JOINTS - 1][2]; - unsigned int branch_ct_sign[2][2]; - unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; - unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2]; - unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2]; - unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2]; - unsigned int branch_ct_fp[2][4 - 1][2]; - unsigned int branch_ct_class0_hp[2][2]; - unsigned int branch_ct_hp[2][2]; - int i, j, k; - counts_to_nmv_context(&tnmvcounts, &prob, 1, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); - - printf("\nCounts =\n { "); - for (j = 0; j < MV_JOINTS; ++j) - printf("%d, ", tnmvcounts.joints[j]); - printf("},\n"); - for (i = 0; i < 2; ++i) { - printf(" {\n"); - printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0], - tnmvcounts.comps[i].sign[1]); - printf(" { "); - for (j = 0; j < MV_CLASSES; ++j) - printf("%d, ", tnmvcounts.comps[i].classes[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d, ", tnmvcounts.comps[i].class0[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0], - tnmvcounts.comps[i].bits[j][1]); - printf("},\n"); - - printf(" {"); - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("},\n"); - - printf(" { "); - for (j = 0; j < 4; ++j) - printf("%d, ", tnmvcounts.comps[i].fp[j]); - printf("},\n"); - - printf(" %d/%d,\n", - tnmvcounts.comps[i].class0_hp[0], - tnmvcounts.comps[i].class0_hp[1]); - printf(" %d/%d,\n", - tnmvcounts.comps[i].hp[0], - tnmvcounts.comps[i].hp[1]); - printf(" },\n"); - } - - printf("\nProbs =\n { "); - for (j = 0; j < MV_JOINTS - 1; ++j) - printf("%d, ", prob.joints[j]); - printf("},\n"); - for (i=0; i< 2; ++i) { - printf(" {\n"); - printf(" %d,\n", prob.comps[i].sign); - printf(" { "); - for (j = 0; j < MV_CLASSES - 1; ++j) - printf("%d, ", prob.comps[i].classes[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE - 1; ++j) - printf("%d, ", prob.comps[i].class0[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d, ", prob.comps[i].bits[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 3; ++k) - printf("%d, ", prob.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("},\n"); - printf(" { "); - for (j = 0; j < 3; ++j) - printf("%d, ", prob.comps[i].fp[j]); - printf("},\n"); - - printf(" %d,\n", 
prob.comps[i].class0_hp); - printf(" %d,\n", prob.comps[i].hp); - printf(" },\n"); - } -} - -static void add_nmvcount(nmv_context_counts* const dst, - const nmv_context_counts* const src) { - int i, j, k; - for (j = 0; j < MV_JOINTS; ++j) { - dst->joints[j] += src->joints[j]; - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_VALS; ++j) { - dst->comps[i].mvcount[j] += src->comps[i].mvcount[j]; - } - dst->comps[i].sign[0] += src->comps[i].sign[0]; - dst->comps[i].sign[1] += src->comps[i].sign[1]; - for (j = 0; j < MV_CLASSES; ++j) { - dst->comps[i].classes[j] += src->comps[i].classes[j]; - } - for (j = 0; j < CLASS0_SIZE; ++j) { - dst->comps[i].class0[j] += src->comps[i].class0[j]; - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - dst->comps[i].bits[j][0] += src->comps[i].bits[j][0]; - dst->comps[i].bits[j][1] += src->comps[i].bits[j][1]; - } - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - for (k = 0; k < 4; ++k) { - dst->comps[i].class0_fp[j][k] += src->comps[i].class0_fp[j][k]; - } - } - for (j = 0; j < 4; ++j) { - dst->comps[i].fp[j] += src->comps[i].fp[j]; - } - dst->comps[i].class0_hp[0] += src->comps[i].class0_hp[0]; - dst->comps[i].class0_hp[1] += src->comps[i].class0_hp[1]; - dst->comps[i].hp[0] += src->comps[i].hp[0]; - dst->comps[i].hp[1] += src->comps[i].hp[1]; - } -} -#endif - void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { int i, j; nmv_context prob; @@ -378,10 +227,6 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_hp[2][2]; nmv_context *mvc = &cpi->common.fc.nmvc; -#ifdef NMV_STATS - if (!cpi->dummy_packing) - add_nmvcount(&tnmvcounts, &cpi->NMVcount); -#endif counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, branch_ct_joint, branch_ct_sign, branch_ct_classes, branch_ct_class0, branch_ct_bits, @@ -390,22 +235,22 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { for (j = 0; j < MV_JOINTS - 1; ++j) update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], - VP9_NMV_UPDATE_PROB); + NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, - prob.comps[i].sign, VP9_NMV_UPDATE_PROB); + prob.comps[i].sign, NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], - prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].classes[j], NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], - prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].class0[j], NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], - prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].bits[j], NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { @@ -414,20 +259,20 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { for (k = 0; k < 3; ++k) update_mv(bc, branch_ct_class0_fp[i][j][k], &mvc->comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); + prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB); } for (j = 0; j < 3; ++j) update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], - prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB); + prob.comps[i].fp[j], NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, - prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + prob.comps[i].class0_hp, NMV_UPDATE_PROB); 
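// Annotation on the update_mv() calls above and below: each call
// conditionally refreshes one stored MV probability. The encoder
// presumably signals an update flag coded with probability
// NMV_UPDATE_PROB (renamed from VP9_NMV_UPDATE_PROB throughout this
// hunk) and sends a new value only when the branch counts justify the
// cost, so unchanged contexts stay cheap.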
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, - prob.comps[i].hp, VP9_NMV_UPDATE_PROB); + prob.comps[i].hp, NMV_UPDATE_PROB); } } } @@ -471,7 +316,7 @@ void vp9_build_nmv_cost_table(int *mvjoint, void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv) { - MODE_INFO *mi = x->e_mbd.mode_info_context; + MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; MV diff; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; @@ -488,7 +333,7 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col; vp9_inc_mv(&diff, &cpi->NMVcount); - if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) { + if (mi->mbmi.ref_frame[1] > INTRA_FRAME) { diff.row = mi->bmi[i].as_mv[1].as_mv.row - second_best_ref_mv->as_mv.row; diff.col = mi->bmi[i].as_mv[1].as_mv.col - diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 6ba2a4f..9cf7b83 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -346,7 +346,7 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r // Set up pointers for this macro block recon buffer xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; - switch (xd->mode_info_context->mbmi.sb_type) { + switch (xd->this_mi->mbmi.sb_type) { case BLOCK_8X8: vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, @@ -385,7 +385,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp9_variance_fn_ptr_t v_fn_ptr = - cpi->fn_ptr[xd->mode_info_context->mbmi.sb_type]; + cpi->fn_ptr[xd->this_mi->mbmi.sb_type]; int new_mv_mode_penalty = 256; int sr = 0; @@ -402,7 +402,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, further_steps -= sr; // override the default variance function to use MSE - switch (xd->mode_info_context->mbmi.sb_type) { + switch (xd->this_mi->mbmi.sb_type) { case BLOCK_8X8: v_fn_ptr.vf = vp9_mse8x8; break; @@ -505,8 +505,11 @@ void vp9_first_pass(VP9_COMP *cpi) { setup_dst_planes(xd, new_yv12, 0, 0); x->partition_info = x->pi; - - xd->mode_info_context = cm->mi; + xd->mi_8x8 = cm->mi_grid_visible; + // required for vp9_frame_init_quantizer + xd->this_mi = + xd->mi_8x8[0] = cm->mi; + xd->mic_stream_ptr = cm->mi; setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); @@ -549,26 +552,26 @@ void vp9_first_pass(VP9_COMP *cpi) { if (mb_col * 2 + 1 < cm->mi_cols) { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mode_info_context->mbmi.sb_type = BLOCK_16X16; + xd->this_mi->mbmi.sb_type = BLOCK_16X16; } else { - xd->mode_info_context->mbmi.sb_type = BLOCK_16X8; + xd->this_mi->mbmi.sb_type = BLOCK_16X8; } } else { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mode_info_context->mbmi.sb_type = BLOCK_8X16; + xd->this_mi->mbmi.sb_type = BLOCK_8X16; } else { - xd->mode_info_context->mbmi.sb_type = BLOCK_8X8; + xd->this_mi->mbmi.sb_type = BLOCK_8X8; } } - xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; + xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME; set_mi_row_col(cm, xd, mb_row << 1, - 1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type), + 1 << mi_height_log2(xd->this_mi->mbmi.sb_type), mb_col << 1, - 1 << mi_height_log2(xd->mode_info_context->mbmi.sb_type)); + 1 << mi_height_log2(xd->this_mi->mbmi.sb_type)); // do intra 16x16 prediction - this_error = 
vp9_encode_intra(cpi, x, use_dc_pred); + this_error = vp9_encode_intra(x, use_dc_pred); // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame) // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate for the mv. @@ -661,13 +664,13 @@ void vp9_first_pass(VP9_COMP *cpi) { mv.as_mv.col <<= 3; this_error = motion_error; vp9_set_mbmode_and_mvs(x, NEWMV, &mv); - xd->mode_info_context->mbmi.txfm_size = TX_4X4; - xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; - xd->mode_info_context->mbmi.ref_frame[1] = NONE; + xd->this_mi->mbmi.tx_size = TX_4X4; + xd->this_mi->mbmi.ref_frame[0] = LAST_FRAME; + xd->this_mi->mbmi.ref_frame[1] = NONE; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, - xd->mode_info_context->mbmi.sb_type); - vp9_encode_sby(cm, x, xd->mode_info_context->mbmi.sb_type); + xd->this_mi->mbmi.sb_type); + vp9_encode_sby(x, xd->this_mi->mbmi.sb_type); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; @@ -1092,7 +1095,6 @@ static int estimate_cq(VP9_COMP *cpi, return q; } - extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { @@ -1580,7 +1582,7 @@ void define_fixed_arf_period(VP9_COMP *cpi) { // Analyse and define a gf/arf group. static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS next_frame = { 0 }; FIRSTPASS_STATS *start_pos; int i; double boost_score = 0.0; @@ -1616,8 +1618,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { start_pos = cpi->twopass.stats_in; - vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean - // Load stats for the current frame. mod_frame_err = calculate_modified_err(cpi, this_frame); @@ -1720,6 +1720,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { old_boost_score = boost_score; } + cpi->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); + // Don't allow a gf too near the next kf if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { @@ -2081,63 +2083,71 @@ void vp9_second_pass(VP9_COMP *cpi) { vp9_clear_system_state(); - // Special case code for first frame. - if (cpi->common.current_video_frame == 0) { - cpi->twopass.est_max_qcorrection_factor = 1.0; - - // Set a cq_level in constrained quality mode. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left)); - - cpi->cq_target_quality = cpi->oxcf.cq_level; - if (est_cq > cpi->cq_target_quality) - cpi->cq_target_quality = est_cq; - } + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + cpi->active_worst_quality = cpi->oxcf.cq_level; + } else { + // Special case code for first frame. + if (cpi->common.current_video_frame == 0) { + int section_target_bandwidth = + (int)(cpi->twopass.bits_left / frames_left); + cpi->twopass.est_max_qcorrection_factor = 1.0; + + // Set a cq_level in constrained quality mode.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { + int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, + section_target_bandwidth); + + cpi->cq_target_quality = cpi->oxcf.cq_level; + if (est_cq > cpi->cq_target_quality) + cpi->cq_target_quality = est_cq; + } - // guess at maxq needed in 2nd pass - cpi->twopass.maxq_max_limit = cpi->worst_quality; - cpi->twopass.maxq_min_limit = cpi->best_quality; + // guess at maxq needed in 2nd pass + cpi->twopass.maxq_max_limit = cpi->worst_quality; + cpi->twopass.maxq_min_limit = cpi->best_quality; - tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left)); + tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, + section_target_bandwidth); - cpi->active_worst_quality = tmp_q; - cpi->ni_av_qi = tmp_q; - cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); + cpi->active_worst_quality = tmp_q; + cpi->ni_av_qi = tmp_q; + cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); #ifndef ONE_SHOT_Q_ESTIMATE - // Limit the maxq value returned subsequently. - // This increases the risk of overspend or underspend if the initial - // estimate for the clip is bad, but helps prevent excessive - // variation in Q, especially near the end of a clip - // where for example a small overspend may cause Q to crash - adjust_maxq_qrange(cpi); + // Limit the maxq value returned subsequently. + // This increases the risk of overspend or underspend if the initial + // estimate for the clip is bad, but helps prevent excessive + // variation in Q, especially near the end of a clip + // where for example a small overspend may cause Q to crash + adjust_maxq_qrange(cpi); #endif - } + } #ifndef ONE_SHOT_Q_ESTIMATE - // The last few frames of a clip almost always have to few or too many - // bits and for the sake of over exact rate control we dont want to make - // radical adjustments to the allowed quantizer range just to use up a - // few surplus bits or get beneath the target rate. - else if ((cpi->common.current_video_frame < - (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) && - ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < - (unsigned int)cpi->twopass.total_stats.count)) { - if (frames_left < 1) - frames_left = 1; - - tmp_q = estimate_max_q( - cpi, - &cpi->twopass.total_left_stats, - (int)(cpi->twopass.bits_left / frames_left)); - - // Make a damped adjustment to active max Q - cpi->active_worst_quality = - adjust_active_maxq(cpi->active_worst_quality, tmp_q); - } + // The last few frames of a clip almost always have too few or too many + // bits and for the sake of over-exact rate control we don't want to make + // radical adjustments to the allowed quantizer range just to use up a + // few surplus bits or get beneath the target rate.
+ else if ((cpi->common.current_video_frame < + (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) && + ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < + (unsigned int)cpi->twopass.total_stats.count)) { + int section_target_bandwidth = + (int)(cpi->twopass.bits_left / frames_left); + if (frames_left < 1) + frames_left = 1; + + tmp_q = estimate_max_q( + cpi, + &cpi->twopass.total_left_stats, + section_target_bandwidth); + + // Make a damped adjustment to active max Q + cpi->active_worst_quality = + adjust_active_maxq(cpi->active_worst_quality, tmp_q); + } #endif + } vp9_zero(this_frame); if (EOF == input_stats(cpi, &this_frame)) return; @@ -2157,6 +2167,8 @@ void vp9_second_pass(VP9_COMP *cpi) { // Define next gf group and assign bits to it this_frame_copy = this_frame; + cpi->gf_zeromotion_pct = 0; + #if CONFIG_MULTIPLE_ARF if (cpi->multi_arf_enabled) { define_fixed_arf_period(cpi); @@ -2167,6 +2179,15 @@ void vp9_second_pass(VP9_COMP *cpi) { } #endif + if (cpi->gf_zeromotion_pct > 995) { + // As long as max_thresh for encode breakout is small enough, it is ok + // to enable it for no-show frame, i.e. set enable_encode_breakout to 2. + if (!cpi->common.show_frame) + cpi->enable_encode_breakout = 0; + else + cpi->enable_encode_breakout = 2; + } + // If we are going to code an altref frame at the end of the group // and the current frame is not a key frame.... // If the previous group used an arf this frame has already benefited diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 154d31a..5a671f2 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -40,14 +40,15 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, (cpi->speed < 8 ? (cpi->speed > 5 ? 
1 : 0) : 2); step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); - vp9_clamp_mv_min_max(x, ref_mv); + vp9_clamp_mv_min_max(x, &ref_mv->as_mv); ref_full.as_mv.col = ref_mv->as_mv.col >> 3; ref_full.as_mv.row = ref_mv->as_mv.row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full, dst_mv, step_param, x->errorperbit, - &v_fn_ptr, NULL, NULL, NULL, NULL, ref_mv); + best_err = vp9_hex_search(x, &ref_full, step_param, x->errorperbit, + 0, &v_fn_ptr, + 0, ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) @@ -58,7 +59,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x, dst_mv, ref_mv, x->errorperbit, &v_fn_ptr, - NULL, NULL, + 0, cpi->sf.subpel_iters_per_step, NULL, NULL, & distortion, &sse); } @@ -144,7 +145,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, for (mode = DC_PRED; mode <= TM_PRED; mode++) { unsigned int err; - xd->mode_info_context->mbmi.mode = mode; + xd->this_mi->mbmi.mode = mode; vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride); @@ -240,9 +241,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int mb_col, mb_row, offset = 0; int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; int_mv arf_top_mv, gld_top_mv; - MODE_INFO mi_local; - - vp9_zero(mi_local); + MODE_INFO mi_local = { { 0 } }; // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; @@ -254,7 +253,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->plane[0].dst.stride = buf->y_stride; xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; - xd->mode_info_context = &mi_local; + xd->this_mi = &mi_local; mi_local.mbmi.sb_type = BLOCK_16X16; mi_local.mbmi.ref_frame[0] = LAST_FRAME; mi_local.mbmi.ref_frame[1] = NONE; @@ -308,7 +307,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, static void separate_arf_mbs(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int mb_col, mb_row, offset, i; - int ncnt[4]; + int ncnt[4] = { 0 }; int n_frames = cpi->mbgraph_n_frames; int *arf_not_zz; @@ -344,7 +343,6 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - vpx_memset(ncnt, 0, sizeof(ncnt)); for (offset = 0, mb_row = 0; mb_row < cm->mb_rows; offset += cm->mb_cols, mb_row++) { for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 88beee7..1360088 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -8,28 +8,30 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <stdio.h> #include <limits.h> #include <math.h> +#include <stdio.h> -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_mcomp.h" -#include "vpx_mem/vpx_mem.h" #include "./vpx_config.h" + +#include "vpx_mem/vpx_mem.h" + #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_mcomp.h" + // #define NEW_DIAMOND_SEARCH -void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { - int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); - int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 
1 : 0); - int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; - int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; +void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) { + const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); + const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); + const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; - /* Get intersection of UMV window and valid MV window to reduce # of checks in diamond search. */ + // Get intersection of UMV window and valid MV window to reduce # of checks + // in diamond search. if (x->mv_col_min < col_min) x->mv_col_min = col_min; if (x->mv_col_max > col_max) @@ -245,52 +247,112 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { }, \ v = INT_MAX;) -int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1) { +#define FIRST_LEVEL_CHECKS \ + { \ + unsigned int left, right, up, down, diag; \ + CHECK_BETTER(left, tr, tc - hstep); \ + CHECK_BETTER(right, tr, tc + hstep); \ + CHECK_BETTER(up, tr - hstep, tc); \ + CHECK_BETTER(down, tr + hstep, tc); \ + whichdir = (left < right ? 0 : 1) + \ + (up < down ? 0 : 2); \ + switch (whichdir) { \ + case 0: \ + CHECK_BETTER(diag, tr - hstep, tc - hstep); \ + break; \ + case 1: \ + CHECK_BETTER(diag, tr - hstep, tc + hstep); \ + break; \ + case 2: \ + CHECK_BETTER(diag, tr + hstep, tc - hstep); \ + break; \ + case 3: \ + CHECK_BETTER(diag, tr + hstep, tc + hstep); \ + break; \ + } \ + } + +#define SECOND_LEVEL_CHECKS \ + { \ + int kr, kc; \ + unsigned int second; \ + if (tr != br && tc != bc) { \ + kr = br - tr; \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + kr, tc + 2 * kc); \ + CHECK_BETTER(second, tr + 2 * kr, tc + kc); \ + } else if (tr == br && tc != bc) { \ + kc = bc - tc; \ + CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \ + CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \ + switch (whichdir) { \ + case 0: \ + case 1: \ + CHECK_BETTER(second, tr + hstep, tc + kc); \ + break; \ + case 2: \ + case 3: \ + CHECK_BETTER(second, tr - hstep, tc + kc); \ + break; \ + } \ + } else if (tr != br && tc == bc) { \ + kr = br - tr; \ + CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \ + CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \ + switch (whichdir) { \ + case 0: \ + case 2: \ + CHECK_BETTER(second, tr + kr, tc + hstep); \ + break; \ + case 1: \ + case 3: \ + CHECK_BETTER(second, tr + kr, tc - hstep); \ + break; \ + } \ + } \ + } + +int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1) { uint8_t *z = x->plane[0].src.buf; int src_stride = x->plane[0].src.stride; MACROBLOCKD *xd = &x->e_mbd; - int rr, rc, br, bc, hstep; - int tr, tc; unsigned int besterr = INT_MAX; - unsigned int left, right, up, down, diag; unsigned int sse; unsigned int whichdir; - unsigned int halfiters = 4; - unsigned int quarteriters = 4; - unsigned int eighthiters = 4; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; int thismse; - int maxc, minc, maxr, minr; - int y_stride; - int offset; uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * 
xd->plane[0].pre[0].stride + bestmv->as_mv.col; - y_stride = xd->plane[0].pre[0].stride; - - rr = ref_mv->as_mv.row; - rc = ref_mv->as_mv.col; - br = bestmv->as_mv.row << 3; - bc = bestmv->as_mv.col << 3; - hstep = 4; - minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1)); - maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1)); - minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1)); - maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1)); + const int y_stride = xd->plane[0].pre[0].stride; - tr = br; - tc = bc; + int rr = ref_mv->as_mv.row; + int rc = ref_mv->as_mv.col; + int br = bestmv->as_mv.row << 3; + int bc = bestmv->as_mv.col << 3; + int hstep = 4; + const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX); + const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX); + const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX); + const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX); + int tr = br; + int tc = bc; - offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; // central mv bestmv->as_mv.row <<= 3; @@ -303,105 +365,45 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, // TODO: Each subsequent iteration checks at least one point in // common with the last iteration could be 2 ( if diag selected) - while (--halfiters) { + while (halfiters--) { // 1/2 pel - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; - } - + FIRST_LEVEL_CHECKS; // no reason to check the same one again. if (tr == br && tc == bc) break; - tr = br; tc = bc; } // TODO: Each subsequent iteration checks at least one point in common with // the last iteration could be 2 ( if diag selected) 1/4 pel - hstep >>= 1; - while (--quarteriters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); + + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + while (quarteriters--) { + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. + if (tr == br && tc == bc) break; + tr = br; + tc = bc; } - - // no reason to check the same one again. 
- if (tr == br && tc == bc) - break; - - tr = br; - tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) { + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { hstep >>= 1; - while (--eighthiters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; - } - + while (eighthiters--) { + FIRST_LEVEL_CHECKS; // no reason to check the same one again. if (tr == br && tc == bc) break; - tr = br; tc = bc; } } + bestmv->as_mv.row = br; bestmv->as_mv.col = bc; @@ -412,39 +414,31 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, return besterr; } -#undef DIST -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ - z, src_stride, &sse, second_pred) - -int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, +int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1, - const uint8_t *second_pred, int w, int h) { + unsigned int *sse1) { uint8_t *z = x->plane[0].src.buf; int src_stride = x->plane[0].src.stride; MACROBLOCKD *xd = &x->e_mbd; - int rr, rc, br, bc, hstep; int tr, tc; unsigned int besterr = INT_MAX; - unsigned int left, right, up, down, diag; unsigned int sse; unsigned int whichdir; - unsigned int halfiters = 4; - unsigned int quarteriters = 4; - unsigned int eighthiters = 4; int thismse; int maxc, minc, maxr, minr; int y_stride; int offset; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; @@ -456,19 +450,18 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, br = bestmv->as_mv.row << 3; bc = bestmv->as_mv.col << 3; hstep = 4; - minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - - ((1 << MV_MAX_BITS) - 1)); - maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + - ((1 << MV_MAX_BITS) - 1)); - minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - - ((1 << MV_MAX_BITS) - 1)); - maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + - ((1 << MV_MAX_BITS) - 1)); + minc = MAX(x->mv_col_min << 3, + (ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1)); + maxc = MIN(x->mv_col_max << 3, + (ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1)); + minr = MAX(x->mv_row_min << 3, + (ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1)); + maxr = MIN(x->mv_row_max << 3, + (ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1)); tr = br; tc = bc; - offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; // central mv @@ -476,114 +469,40 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, bestmv->as_mv.col <<= 3; // calculate central point error - // TODO(yunqingwang): central pointer error was already calculated in full- - // pixel search, and can be passed in this function. 
- comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); - besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + besterr = vfp->vf(y, y_stride, z, src_stride, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - // Each subsequent iteration checks at least one point in - // common with the last iteration could be 2 ( if diag selected) - while (--halfiters) { - // 1/2 pel - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + // 1/2 pel + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; } - - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; tc = bc; } - // Each subsequent iteration checks at least one point in common with - // the last iteration could be 2 ( if diag selected) 1/4 pel - hstep >>= 1; - while (--quarteriters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; } - - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; tc = bc; } - if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) { - hstep >>= 1; - while (--eighthiters) { - CHECK_BETTER(left, tr, tc - hstep); - CHECK_BETTER(right, tr, tc + hstep); - CHECK_BETTER(up, tr - hstep, tc); - CHECK_BETTER(down, tr + hstep, tc); - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - - switch (whichdir) { - case 0: - CHECK_BETTER(diag, tr - hstep, tc - hstep); - break; - case 1: - CHECK_BETTER(diag, tr - hstep, tc + hstep); - break; - case 2: - CHECK_BETTER(diag, tr + hstep, tc - hstep); - break; - case 3: - CHECK_BETTER(diag, tr + hstep, tc + hstep); - break; - } - - // no reason to check the same one again. 
- if (tr == br && tc == bc) - break; - - tr = br; - tc = bc; - } - } bestmv->as_mv.row = br; bestmv->as_mv.col = bc; @@ -594,636 +513,236 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, return besterr; } - -#undef MVC -#undef PRE #undef DIST -#undef IFMVCV -#undef CHECK_BETTER -#undef MIN -#undef MAX +/* returns subpixel variance error function */ +#define DIST(r, c) \ + vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ + z, src_stride, &sse, second_pred) -int vp9_find_best_sub_pixel_step(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1) { - int bestmse = INT_MAX; - int_mv startmv; - int_mv this_mv; - int_mv orig_mv; - int yrow_movedback = 0, ycol_movedback = 0; - uint8_t *z = x->plane[0].src.buf; - int src_stride = x->plane[0].src.stride; - int left, right, up, down, diag; +int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { + uint8_t *const z = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + MACROBLOCKD *const xd = &x->e_mbd; + + unsigned int besterr = INT_MAX; unsigned int sse; - int whichdir; + unsigned int whichdir; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; int thismse; - int y_stride; - MACROBLOCKD *xd = &x->e_mbd; - uint8_t *y = xd->plane[0].pre[0].buf + + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); + uint8_t *const y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; - y_stride = xd->plane[0].pre[0].stride; - - // central mv - bestmv->as_mv.row <<= 3; - bestmv->as_mv.col <<= 3; - startmv = *bestmv; - orig_mv = *bestmv; - - // calculate central point error - bestmse = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = bestmse; - bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); - thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse); - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } - - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); - thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse); - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.row += 8; - thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - 
*sse1 = sse; - } + const int y_stride = xd->plane[0].pre[0].stride; + int rr = ref_mv->as_mv.row; + int rc = ref_mv->as_mv.col; + int br = bestmv->as_mv.row << 3; + int bc = bestmv->as_mv.col << 3; + int hstep = 4; + const int minc = MAX(x->mv_col_min << 3, ref_mv->as_mv.col - MV_MAX); + const int maxc = MIN(x->mv_col_max << 3, ref_mv->as_mv.col + MV_MAX); + const int minr = MAX(x->mv_row_min << 3, ref_mv->as_mv.row - MV_MAX); + const int maxr = MIN(x->mv_row_max << 3, ref_mv->as_mv.row + MV_MAX); - // now check 1 more diagonal - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - // for(whichdir =0;whichdir<4;whichdir++) - // { - this_mv = startmv; + int tr = br; + int tc = bc; - switch (whichdir) { - case 0: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, src_stride, - &sse); - break; - case 1: - this_mv.as_mv.col += 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, src_stride, - &sse); - break; - case 2: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse); - break; - case 3: - default: - this_mv.as_mv.col += 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse); - break; - } - - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } - -// } - - - // time to check quarter pels. - if (bestmv->as_mv.row < startmv.as_mv.row) { - y -= y_stride; - yrow_movedback = 1; - } - - if (bestmv->as_mv.col < startmv.as_mv.col) { - y--; - ycol_movedback = 1; - } + const int offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; - startmv = *bestmv; - - - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col = startmv.as_mv.col - 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, - src_stride, &sse); - } - - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.col += 4; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } - - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row = startmv.as_mv.row - 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - thismse = vfp->svf(y - y_stride, y_stride, SP(this_mv.as_mv.col), SP(6), - z, src_stride, &sse); - } - - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.row += 4; - thismse = vfp->svf(y, 
y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - *sse1 = sse; - } - - - // now check 1 more diagonal - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - -// for(whichdir=0;whichdir<4;whichdir++) -// { - this_mv = startmv; - - switch (whichdir) { - case 0: - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 2; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, y_stride, - SP(6), SP(this_mv.as_mv.row), z, src_stride, &sse); - } - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 2; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - y_stride - 1, y_stride, - SP(6), SP(6), z, src_stride, &sse); - } - } - - break; - case 1: - this_mv.as_mv.col += 2; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(6), z, src_stride, &sse); - } + // central mv + bestmv->as_mv.row <<= 3; + bestmv->as_mv.col <<= 3; - break; - case 2: - this_mv.as_mv.row += 2; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, y_stride, SP(6), SP(this_mv.as_mv.row), z, - src_stride, &sse); - } + // calculate central point error + // TODO(yunqingwang): central pointer error was already calculated in full- + // pixel search, and can be passed in this function. + comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + // Each subsequent iteration checks at least one point in + // common with the last iteration could be 2 ( if diag selected) + while (halfiters--) { + // 1/2 pel + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. 
+ if (tr == br && tc == bc) break; - case 3: - this_mv.as_mv.col += 2; - this_mv.as_mv.row += 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - break; - } - - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } - - if (!(xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv))) - return bestmse; - - /* Now do 1/8th pixel */ - if (bestmv->as_mv.row < orig_mv.as_mv.row && !yrow_movedback) { - y -= y_stride; - yrow_movedback = 1; - } - - if (bestmv->as_mv.col < orig_mv.as_mv.col && !ycol_movedback) { - y--; - ycol_movedback = 1; - } - - startmv = *bestmv; - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col = startmv.as_mv.col - 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - 1, y_stride, SP(7), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } - - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.col += 2; - thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } - - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row = startmv.as_mv.row - 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); + tr = br; + tc = bc; } - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; - } + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel - this_mv.as_mv.row += 2; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - *sse1 = sse; + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + while (quarteriters--) { + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + tr = br; + tc = bc; + } } - // now check 1 more diagonal - whichdir = (left < right ? 0 : 1) + (up < down ? 
0 : 2); - -// for(whichdir=0;whichdir<4;whichdir++) -// { - this_mv = startmv; - - switch (whichdir) { - case 0: - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 1; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - 1, y_stride, - SP(7), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 1; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - y_stride - 1, y_stride, - SP(7), SP(7), z, src_stride, &sse); - } - } - - break; - case 1: - this_mv.as_mv.col += 1; - - if (startmv.as_mv.row & 7) { - this_mv.as_mv.row -= 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.row = (startmv.as_mv.row - 8) | 7; - thismse = vfp->svf(y - y_stride, y_stride, - SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); - } - - break; - case 2: - this_mv.as_mv.row += 1; - - if (startmv.as_mv.col & 7) { - this_mv.as_mv.col -= 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - } else { - this_mv.as_mv.col = (startmv.as_mv.col - 8) | 7; - thismse = vfp->svf(y - 1, y_stride, - SP(7), SP(this_mv.as_mv.row), z, src_stride, &sse); - } - - break; - case 3: - this_mv.as_mv.col += 1; - this_mv.as_mv.row += 1; - thismse = vfp->svf(y, y_stride, - SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), - z, src_stride, &sse); - break; + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { + hstep >>= 1; + while (eighthiters--) { + FIRST_LEVEL_CHECKS; + // no reason to check the same one again. 
+ if (tr == br && tc == bc) + break; + tr = br; + tc = bc; + } } + bestmv->as_mv.row = br; + bestmv->as_mv.col = bc; - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; - return bestmse; + return besterr; } -#undef SP - -int vp9_find_best_half_pixel_step(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1) { - int bestmse = INT_MAX; - int_mv startmv; - int_mv this_mv; +int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { uint8_t *z = x->plane[0].src.buf; int src_stride = x->plane[0].src.stride; - int left, right, up, down, diag; + MACROBLOCKD *xd = &x->e_mbd; + int rr, rc, br, bc, hstep; + int tr, tc; + unsigned int besterr = INT_MAX; unsigned int sse; - int whichdir; + unsigned int whichdir; int thismse; + int maxc, minc, maxr, minr; int y_stride; - MACROBLOCKD *xd = &x->e_mbd; + int offset; + unsigned int halfiters = iters_per_step; + unsigned int quarteriters = iters_per_step; + unsigned int eighthiters = iters_per_step; + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + - (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + + bestmv->as_mv.col; + y_stride = xd->plane[0].pre[0].stride; + rr = ref_mv->as_mv.row; + rc = ref_mv->as_mv.col; + br = bestmv->as_mv.row << 3; + bc = bestmv->as_mv.col << 3; + hstep = 4; + minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - + ((1 << MV_MAX_BITS) - 1)); + maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + + ((1 << MV_MAX_BITS) - 1)); + minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - + ((1 << MV_MAX_BITS) - 1)); + maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + + ((1 << MV_MAX_BITS) - 1)); + + tr = br; + tc = bc; + + + offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + // central mv bestmv->as_mv.row <<= 3; bestmv->as_mv.col <<= 3; - startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = bestmse; - bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // go left then right and check error - this_mv.as_mv.row = startmv.as_mv.row; - this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); - thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse); - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (left < bestmse) { - *bestmv = this_mv; - bestmse = left; - *distortion = thismse; - *sse1 = sse; - } - - this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse); - right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (right < bestmse) { - *bestmv = this_mv; - bestmse = right; - *distortion = thismse; - *sse1 = sse; - } + // TODO(yunqingwang): central pointer error was already calculated in full- + // pixel search, and can be passed in this 
function. + comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - // go up then down and check error - this_mv.as_mv.col = startmv.as_mv.col; - this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); - thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse); - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); - - if (up < bestmse) { - *bestmv = this_mv; - bestmse = up; - *distortion = thismse; - *sse1 = sse; + // Each subsequent iteration checks at least one point in + // common with the last iteration could be 2 ( if diag selected) + // 1/2 pel + FIRST_LEVEL_CHECKS; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; } + tr = br; + tc = bc; - this_mv.as_mv.row += 8; - thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel - if (down < bestmse) { - *bestmv = this_mv; - bestmse = down; - *distortion = thismse; - *sse1 = sse; + // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only + if (forced_stop != 2) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; } - // now check 1 more diagonal - - whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); - this_mv = startmv; - - switch (whichdir) { - case 0: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, - z, src_stride, &sse); - break; - case 1: - this_mv.as_mv.col += 4; - this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, - z, src_stride, &sse); - break; - case 2: - this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, src_stride, &sse); - break; - case 3: - default: - this_mv.as_mv.col += 4; - this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y, y_stride, z, src_stride, &sse); - break; + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) && + forced_stop == 0) { + hstep >>= 1; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; + } + tr = br; + tc = bc; } + bestmv->as_mv.row = br; + bestmv->as_mv.col = bc; - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit); - - if (diag < bestmse) { - *bestmv = this_mv; - bestmse = diag; - *distortion = thismse; - *sse1 = sse; - } + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; - return bestmse; + return besterr; } +#undef MVC +#undef PRE +#undef DIST +#undef IFMVCV +#undef CHECK_BETTER +#undef SP + #define CHECK_BOUNDS(range) \ {\ all_in = 1;\ @@ -1245,8 +764,10 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, {\ if (thissad < bestsad)\ {\ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, \ - sad_per_bit);\ + if (use_mvcost) \ + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, \ + mvjsadcost, mvsadcost, \ + sad_per_bit);\ if (thissad < bestsad)\ {\ bestsad = thissad;\ @@ -1255,46 +776,53 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, }\ } -static const MV 
next_chkpts[6][3] = { - {{ -2, 0}, { -1, -2}, {1, -2}}, - {{ -1, -2}, {1, -2}, {2, 0}}, - {{1, -2}, {2, 0}, {1, 2}}, - {{2, 0}, {1, 2}, { -1, 2}}, - {{1, 2}, { -1, 2}, { -2, 0}}, - {{ -1, 2}, { -2, 0}, { -1, -2}} -}; - -int vp9_hex_search -( - MACROBLOCK *x, - int_mv *ref_mv, - int_mv *best_mv, - int search_param, - int sad_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjsadcost, int *mvsadcost[2], - int *mvjcost, int *mvcost[2], - int_mv *center_mv -) { +#define get_next_chkpts(list, i, n) \ + list[0] = ((i) == 0 ? (n) - 1 : (i) - 1); \ + list[1] = (i); \ + list[2] = ((i) == (n) - 1 ? 0 : (i) + 1); + +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +// Generic pattern search function that searches over multiple scales. +// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function. +static int vp9_pattern_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + int do_refine, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, int_mv *best_mv, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES] + [MAX_PATTERN_CANDIDATES]) { const MACROBLOCKD* const xd = &x->e_mbd; - MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} }; - MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}}; - int i, j; - + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, j, s, t; uint8_t *what = x->plane[0].src.buf; int what_stride = x->plane[0].src.stride; int in_what_stride = xd->plane[0].pre[0].stride; int br, bc; int_mv this_mv; - unsigned int bestsad = 0x7fffffff; - unsigned int thissad; + int bestsad = INT_MAX; + int thissad; uint8_t *base_offset; uint8_t *this_offset; int k = -1; int all_in; int best_site = -1; - int_mv fcenter_mv; + int best_init_s = search_param_to_steps[search_param]; + int *mvjsadcost = x->nmvjointsadcost; + int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; @@ -1306,7 +834,7 @@ int vp9_hex_search // Work out the start point for the search base_offset = (uint8_t *)(xd->plane[0].pre[0].buf); - this_offset = base_offset + (br * (xd->plane[0].pre[0].stride)) + bc; + this_offset = base_offset + (br * in_what_stride) + bc; this_mv.as_mv.row = br; this_mv.as_mv.col = bc; bestsad = vfp->sdf(what, what_stride, this_offset, @@ -1314,109 +842,310 @@ + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // hex search - // j=0 - CHECK_BOUNDS(2) - - if (all_in) { - for (i = 0; i < 6; i++) { - this_mv.as_mv.row = br + hex[i].row; - this_mv.as_mv.col = bc + hex[i].col; - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + // Search all possible scales up to the search param around the center point; + // pick the scale of the point that is best as the starting scale of + // further steps around it.
+ if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + best_site = -1; + CHECK_BOUNDS((1 << t)) + if (all_in) { + for (i = 0; i < num_candidates[t]; i++) { + this_mv.as_mv.row = br + candidates[t][i].row; + this_mv.as_mv.col = bc + candidates[t][i].col; + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + this_mv.as_mv.row = br + candidates[t][i].row; + this_mv.as_mv.col = bc + candidates[t][i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } } - } else { - for (i = 0; i < 6; i++) { - this_mv.as_mv.row = br + hex[i].row; - this_mv.as_mv.col = bc + hex[i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; } } - if (best_site == -1) - goto cal_neighbors; - else { - br += hex[best_site].row; - bc += hex[best_site].col; - k = best_site; - } - - for (j = 1; j < 127; j++) { + // If the center point is still the best, just skip this and move to + // the refinement step. + if (best_init_s != -1) { + s = best_init_s; best_site = -1; - CHECK_BOUNDS(2) + do { + // No need to search all 6 points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + CHECK_BOUNDS((1 << s)) + if (all_in) { + for (i = 0; i < num_candidates[s]; i++) { + this_mv.as_mv.row = br + candidates[s][i].row; + this_mv.as_mv.col = bc + candidates[s][i].col; + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + this_mv.as_mv.row = br + candidates[s][i].row; + this_mv.as_mv.col = bc + candidates[s][i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } - if (all_in) { - for (i = 0; i < 3; i++) { - this_mv.as_mv.row = br + next_chkpts[k][i].row; - this_mv.as_mv.col = bc + next_chkpts[k][i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER - } - } else { - for (i = 0; i < 3; i++) { - this_mv.as_mv.row = br + next_chkpts[k][i].row; - this_mv.as_mv.col = bc + next_chkpts[k][i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } } - } - if (best_site == -1) - break; - else { - br += next_chkpts[k][best_site].row; - bc += next_chkpts[k][best_site].col; - k += 5 + best_site; - if (k >= 12) k -= 12; - else if 
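The hunk above is the heart of vp9_pattern_search: an optional initial sweep over all scales up to the search parameter to find the most promising radius, followed by a descent from that scale downwards (the s-- loop later in the function), where each scale is searched until no candidate improves before the radius is halved. A compact, self-contained sketch of that coarse-to-fine schedule, using a toy cost function and a 4-point diamond in place of the candidate tables:

    #include <stdio.h>

    /* Dummy SAD: distance from a hidden optimum, to make the walk visible. */
    static int cost(int r, int c) { return (r - 9) * (r - 9) + (c - 5) * (c - 5); }

    int main(void) {
      static const int dr[4] = { 0, 1, 0, -1 }, dc[4] = { 1, 0, -1, 0 };
      int br = 0, bc = 0, best = cost(br, bc);
      int s;
      for (s = 3; s >= 0; s--) {           /* radii 8, 4, 2, 1 (2^s) */
        int moved;
        do {                               /* search this scale to a minimum */
          int i, best_site = -1;
          for (i = 0; i < 4; i++) {
            int t = cost(br + (dr[i] << s), bc + (dc[i] << s));
            if (t < best) { best = t; best_site = i; }
          }
          moved = best_site >= 0;
          if (moved) { br += dr[best_site] << s; bc += dc[best_site] << s; }
        } while (moved);
      }
      printf("best mv: (%d, %d)\n", br, bc);  /* converges to (9, 5) */
      return 0;
    }

With the hidden optimum at (9, 5), the walk moves in steps of 8, then 4, 2 and 1, reaching the minimum in a handful of cost evaluations.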
(k >= 6) k -= 6; - } + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + CHECK_BOUNDS((1 << s)) + + get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); + if (all_in) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + this_mv.as_mv.row = br + + candidates[s][next_chkpts_indices[i]].row; + this_mv.as_mv.col = bc + + candidates[s][next_chkpts_indices[i]].col; + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + this_mv.as_mv.row = br + + candidates[s][next_chkpts_indices[i]].row; + this_mv.as_mv.col = bc + + candidates[s][next_chkpts_indices[i]].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } while (s--); } - // check 4 1-away neighbors -cal_neighbors: - for (j = 0; j < 32; j++) { - best_site = -1; - CHECK_BOUNDS(1) + // Check 4 1-away neighbors if do_refine is true. + // For most well-designed schemes do_refine will not be necessary. + if (do_refine) { + static const MV neighbors[4] = { + {0, -1}, { -1, 0}, {1, 0}, {0, 1}, + }; + for (j = 0; j < 16; j++) { + best_site = -1; + CHECK_BOUNDS(1) + if (all_in) { + for (i = 0; i < 4; i++) { + this_mv.as_mv.row = br + neighbors[i].row; + this_mv.as_mv.col = bc + neighbors[i].col; + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } else { + for (i = 0; i < 4; i++) { + this_mv.as_mv.row = br + neighbors[i].row; + this_mv.as_mv.col = bc + neighbors[i].col; + CHECK_POINT + this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + + this_mv.as_mv.col; + thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, + bestsad); + CHECK_BETTER + } + } - if (all_in) { - for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER - } - } else { - for (i = 0; i < 4; i++) { - this_mv.as_mv.row = br + neighbors[i].row; - this_mv.as_mv.col = bc + neighbors[i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad); - CHECK_BETTER + if (best_site == -1) { + break; + } else { + br += neighbors[best_site].row; + bc += neighbors[best_site].col; } } - - if (best_site == -1) - break; - else { - br += neighbors[best_site].row; - bc += neighbors[best_site].col; - } } best_mv->as_mv.row = br; best_mv->as_mv.col = bc; - return bestsad; + this_offset = base_offset + (best_mv->as_mv.row * (in_what_stride)) + + best_mv->as_mv.col; + this_mv.as_mv.row = best_mv->as_mv.row << 3; + this_mv.as_mv.col = best_mv->as_mv.col << 3; + if (bestsad == INT_MAX) + return INT_MAX; + return + vfp->vf(what, what_stride, this_offset, in_what_stride, + (unsigned int *)(&bestsad)) + + use_mvcost ? 
mv_err_cost(&this_mv, center_mv, x->nmvjointcost, x->mvcost, + x->errorperbit) : 0; +} + + +int vp9_hex_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, int_mv *best_mv) { + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { + 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 + }; + // Note that the largest candidate step at each scale is 2^scale + static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, { 0, 1}, { -1, 1}, {-1, 0}}, + {{-1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}}, + {{-2, -4}, {2, -4}, {4, 0}, {2, 4}, { -2, 4}, { -4, 0}}, + {{-4, -8}, {4, -8}, {8, 0}, {4, 8}, { -4, 8}, { -8, 0}}, + {{-8, -16}, {8, -16}, {16, 0}, {8, 16}, { -8, 16}, { -16, 0}}, + {{-16, -32}, {16, -32}, {32, 0}, {16, 32}, { -16, 32}, { -32, 0}}, + {{-32, -64}, {32, -64}, {64, 0}, {32, 64}, { -32, 64}, { -64, 0}}, + {{-64, -128}, {64, -128}, {128, 0}, {64, 128}, { -64, 128}, { -128, 0}}, + {{-128, -256}, {128, -256}, {256, 0}, {128, 256}, { -128, 256}, { -256, 0}}, + {{-256, -512}, {256, -512}, {512, 0}, {256, 512}, { -256, 512}, { -512, 0}}, + {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024}, + { -1024, 0}}, + }; + return + vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + hex_num_candidates, hex_candidates); +} + +int vp9_bigdia_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv) { + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + static const MV bigdia_candidates[MAX_PATTERN_SCALES] + [MAX_PATTERN_CANDIDATES] = { + {{0, -1}, {1, 0}, { 0, 1}, {-1, 0}}, + {{-1, -1}, {0, -2}, {1, -1}, {2, 0}, {1, 1}, {0, 2}, {-1, 1}, {-2, 0}}, + {{-2, -2}, {0, -4}, {2, -2}, {4, 0}, {2, 2}, {0, 4}, {-2, 2}, {-4, 0}}, + {{-4, -4}, {0, -8}, {4, -4}, {8, 0}, {4, 4}, {0, 8}, {-4, 4}, {-8, 0}}, + {{-8, -8}, {0, -16}, {8, -8}, {16, 0}, {8, 8}, {0, 16}, {-8, 8}, {-16, 0}}, + {{-16, -16}, {0, -32}, {16, -16}, {32, 0}, {16, 16}, {0, 32}, + {-16, 16}, {-32, 0}}, + {{-32, -32}, {0, -64}, {32, -32}, {64, 0}, {32, 32}, {0, 64}, + {-32, 32}, {-64, 0}}, + {{-64, -64}, {0, -128}, {64, -64}, {128, 0}, {64, 64}, {0, 128}, + {-64, 64}, {-128, 0}}, + {{-128, -128}, {0, -256}, {128, -128}, {256, 0}, {128, 128}, {0, 256}, + {-128, 128}, {-256, 0}}, + {{-256, -256}, {0, -512}, {256, -256}, {512, 0}, {256, 256}, {0, 512}, + {-256, 256}, {-512, 0}}, + {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024}, + {-512, 512}, {-1024, 0}}, + }; + return + vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + bigdia_num_candidates, bigdia_candidates); } + +int vp9_square_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv) { + // All scales have 8 closest points in square shape + static const 
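One wrinkle worth noting in the return expression at the top of this hunk, "vfp->vf(...) + use_mvcost ? mv_err_cost(...) : 0": in C, binary + binds tighter than the conditional operator, so this parses as "(vf_result + use_mvcost) ? mv_err_cost(...) : 0" rather than adding a conditional MV cost to the variance. A two-line demonstration of the precedence:

    #include <stdio.h>

    int main(void) {
      int base = 100, use_cost = 0, extra = 7;
      int a = base + use_cost ? extra : 0;    /* parses as (base + use_cost) ? ... */
      int b = base + (use_cost ? extra : 0);  /* likely the intended grouping */
      printf("%d %d\n", a, b);                /* prints: 7 100 */
      return 0;
    }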
int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + static const MV square_candidates[MAX_PATTERN_SCALES] + [MAX_PATTERN_CANDIDATES] = { + {{-1, -1}, {0, -1}, {1, -1}, {1, 0}, {1, 1}, {0, 1}, {-1, 1}, {-1, 0}}, + {{-2, -2}, {0, -2}, {2, -2}, {2, 0}, {2, 2}, {0, 2}, {-2, 2}, {-2, 0}}, + {{-4, -4}, {0, -4}, {4, -4}, {4, 0}, {4, 4}, {0, 4}, {-4, 4}, {-4, 0}}, + {{-8, -8}, {0, -8}, {8, -8}, {8, 0}, {8, 8}, {0, 8}, {-8, 8}, {-8, 0}}, + {{-16, -16}, {0, -16}, {16, -16}, {16, 0}, {16, 16}, {0, 16}, + {-16, 16}, {-16, 0}}, + {{-32, -32}, {0, -32}, {32, -32}, {32, 0}, {32, 32}, {0, 32}, + {-32, 32}, {-32, 0}}, + {{-64, -64}, {0, -64}, {64, -64}, {64, 0}, {64, 64}, {0, 64}, + {-64, 64}, {-64, 0}}, + {{-128, -128}, {0, -128}, {128, -128}, {128, 0}, {128, 128}, {0, 128}, + {-128, 128}, {-128, 0}}, + {{-256, -256}, {0, -256}, {256, -256}, {256, 0}, {256, 256}, {0, 256}, + {-256, 256}, {-256, 0}}, + {{-512, -512}, {0, -512}, {512, -512}, {512, 0}, {512, 512}, {0, 512}, + {-512, 512}, {-512, 0}}, + {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024}, + {0, 1024}, {-1024, 1024}, {-1024, 0}}, + }; + return + vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, 0, vfp, use_mvcost, + center_mv, best_mv, + square_num_candidates, square_candidates); +}; + #undef CHECK_BOUNDS #undef CHECK_POINT #undef CHECK_BETTER @@ -1808,7 +1537,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, int in_what_stride = xd->plane[0].pre[0].stride; int mv_stride = xd->plane[0].pre[0].stride; uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0]; + int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; int_mv this_mv; int bestsad = INT_MAX; int r, c; @@ -1844,18 +1573,12 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border - if (col_min < x->mv_col_min) - col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) - col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) - row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) - row_max = x->mv_row_max; + // Apply further limits to prevent us looking using vectors that stretch + // beyond the UMV border + col_min = MAX(col_min, x->mv_col_min); + col_max = MIN(col_max, x->mv_col_max); + row_min = MAX(row_min, x->mv_row_min); + row_max = MIN(row_max, x->mv_row_max); for (r = row_min; r < row_max; r++) { this_mv.as_mv.row = r; @@ -1902,7 +1625,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, int in_what_stride = xd->plane[0].pre[0].stride; int mv_stride = xd->plane[0].pre[0].stride; uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0]; + int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -1940,18 +1663,12 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border - if (col_min < x->mv_col_min) - col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) - col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) - row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) - row_max = x->mv_row_max; + // Apply further limits to prevent us 
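The full-search hunks in this region replace four range-limiting if statements with MAX/MIN clamps; the two forms are equivalent, since restricting [col_min, col_max] to the UMV border is just raising the lower bound and lowering the upper bound. A sketch, assuming the usual MAX/MIN macros:

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      int col_min = -40, col_max = 40;            /* raw search window */
      int mv_col_min = -32, mv_col_max = 24;      /* UMV border limits */

      /* Equivalent to: if (col_min < mv_col_min) col_min = mv_col_min; etc. */
      col_min = MAX(col_min, mv_col_min);
      col_max = MIN(col_max, mv_col_max);

      printf("clamped window: [%d, %d]\n", col_min, col_max);  /* [-32, 24] */
      return 0;
    }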
looking using vectors that stretch + // beyond the UMV border + col_min = MAX(col_min, x->mv_col_min); + col_max = MIN(col_max, x->mv_col_max); + row_min = MAX(row_min, x->mv_row_min); + row_max = MIN(row_max, x->mv_row_max); for (r = row_min; r < row_max; r++) { this_mv.as_mv.row = r; @@ -2030,7 +1747,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, int in_what_stride = xd->plane[0].pre[0].stride; int mv_stride = xd->plane[0].pre[0].stride; uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mode_info_context->bmi[n].as_mv[0]; + int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; int_mv this_mv; unsigned int bestsad = INT_MAX; int r, c; @@ -2069,18 +1786,12 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, + mvsad_err_cost(best_mv, &fcenter_mv, mvjsadcost, mvsadcost, sad_per_bit); - // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border - if (col_min < x->mv_col_min) - col_min = x->mv_col_min; - - if (col_max > x->mv_col_max) - col_max = x->mv_col_max; - - if (row_min < x->mv_row_min) - row_min = x->mv_row_min; - - if (row_max > x->mv_row_max) - row_max = x->mv_row_max; + // Apply further limits to prevent us looking using vectors that stretch + // beyond the UMV border + col_min = MAX(col_min, x->mv_col_min); + col_max = MIN(col_max, x->mv_col_max); + row_min = MAX(row_min, x->mv_row_min); + row_max = MIN(row_max, x->mv_row_max); for (r = row_min; r < row_max; r++) { this_mv.as_mv.row = r; @@ -2113,7 +1824,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, } } - while ((c + 2) < col_max) { + while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) { int i; fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index 097d33c..3598fa0 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -23,7 +23,7 @@ // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) -void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); +void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv); int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int weight); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); @@ -40,19 +40,61 @@ int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, int_mv *dst_mv); int vp9_hex_search(MACROBLOCK *x, - int_mv *ref_mv, int_mv *best_mv, - int search_param, int error_per_bit, + int_mv *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, const vp9_variance_fn_ptr_t *vf, - int *mvjsadcost, int *mvsadcost[2], - int *mvjcost, int *mvcost[2], - int_mv *center_mv); + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv); +int vp9_bigdia_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vf, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv); +int vp9_square_search(MACROBLOCK *x, + int_mv *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, + const vp9_variance_fn_ptr_t *vf, + int use_mvcost, + int_mv *center_mv, + int_mv *best_mv); -typedef int (fractional_mv_step_fp) (MACROBLOCK *x, int_mv - *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse); -extern fractional_mv_step_fp vp9_find_best_sub_pixel_step_iteratively; -extern fractional_mv_step_fp vp9_find_best_sub_pixel_step; 
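The added fn_ptr->sdx3f != NULL guard in vp9_full_search_sadx8 protects against variance tables that do not provide a 3-wide SAD kernel; when the pointer is NULL the batched loop never runs and the scalar per-column path does all the work. The same guard pattern in isolation (types and names here are illustrative, not libvpx's):

    #include <stdio.h>

    typedef void (*sad_x3_fn)(const int *ref, int n, int out[3]);

    /* Batched kernel: three results per call (stand-in for sdx3f). */
    static void sad_x3(const int *ref, int n, int out[3]) {
      int k;
      for (k = 0; k < 3; k++)
        out[k] = ref[k] * n;
    }

    static void search(const int *ref, int n, sad_x3_fn x3) {
      int c = 0;
      while (c + 2 < n && x3 != NULL) {   /* fast path only if kernel exists */
        int out[3];
        x3(ref + c, n, out);
        printf("batched 3-wide check at col %d (first sad %d)\n", c, out[0]);
        c += 3;
      }
      for (; c < n; c++)                  /* scalar fallback */
        printf("scalar check at col %d\n", c);
    }

    int main(void) {
      int ref[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
      search(ref, 8, sad_x3);  /* uses the batched kernel */
      search(ref, 8, NULL);    /* no kernel: everything goes scalar */
      return 0;
    }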
-extern fractional_mv_step_fp vp9_find_best_half_pixel_step; +typedef int (fractional_mv_step_fp) ( + MACROBLOCK *x, + int_mv *bestmv, + int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, // 0 - full, 1 - qtr only, 2 - half only + int iters_per_step, + int *mvjcost, + int *mvcost[2], + int *distortion, + unsigned int *sse); +extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; + +typedef int (fractional_mv_step_comp_fp) ( + MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, // 0 - full, 1 - qtr only, 2 - half only + int iters_per_step, + int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, + int w, int h); +extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative; +extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x, int_mv *ref_mv, int sad_per_bit, @@ -75,15 +117,6 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], int_mv *center_mv); -int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, - int_mv *bestmv, int_mv *ref_mv, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int *mvjcost, int *mvcost[2], - int *distortion, unsigned int *sse1, - const uint8_t *second_pred, - int w, int h); - int vp9_refining_search_8p_c(MACROBLOCK *x, int_mv *ref_mv, int error_per_bit, int search_range, vp9_variance_fn_ptr_t *fn_ptr, diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c index 993aba7..a5dfaed 100644 --- a/libvpx/vp9/encoder/vp9_modecosts.c +++ b/libvpx/vp9/encoder/vp9_modecosts.c @@ -16,28 +16,28 @@ void vp9_init_mode_costs(VP9_COMP *c) { - VP9_COMMON *x = &c->common; + VP9_COMMON *const cm = &c->common; const vp9_tree_p KT = vp9_intra_mode_tree; int i, j; - for (i = 0; i < VP9_INTRA_MODES; i++) { - for (j = 0; j < VP9_INTRA_MODES; j++) { + for (i = 0; i < INTRA_MODES; i++) { + for (j = 0; j < INTRA_MODES; j++) { vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], KT); } } // TODO(rbultje) separate tables for superblock costing? - vp9_cost_tokens(c->mb.mbmode_cost, x->fc.y_mode_prob[1], + vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1], vp9_intra_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[1], - x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree); + cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[0], - vp9_kf_uv_mode_prob[VP9_INTRA_MODES - 1], + vp9_kf_uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i) + for (i = 0; i <= SWITCHABLE_FILTERS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], - x->fc.switchable_interp_prob[i], + cm->fc.switchable_interp_prob[i], vp9_switchable_interp_tree); } diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index db03995..883b31e 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
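vp9_init_mode_costs above turns the frame's mode probability trees into per-mode rate costs through vp9_cost_tokens. The cost of a decision coded with probability p out of 256 is about -log2(p/256) bits, held in fixed point; an illustrative calculation (libvpx itself reads this from a precomputed table, and the exact fixed-point scale is an internal detail):

    #include <math.h>
    #include <stdio.h>

    /* Approximate cost, in 1/256-bit units here, of a decision coded with
     * probability prob/256. Illustrative only. */
    static int approx_cost(int prob) {
      return (int)lround(-256.0 * log2(prob / 256.0));
    }

    int main(void) {
      printf("p = 128/256 -> %d (one bit)\n", approx_cost(128));   /* 256 */
      printf("p = 192/256 -> %d\n", approx_cost(192));             /* ~106 */
      printf("p =  64/256 -> %d (two bits)\n", approx_cost(64));   /* 512 */
      return 0;
    }

Likely modes (high probability) therefore get small rate costs in the tables that vp9_cost_tokens fills, and unlikely modes get large ones, which is exactly what the RD mode decision needs.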
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -28,7 +28,7 @@ #include "vp9/encoder/vp9_segmentation.h" #include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vpx_mem/vpx_mem.h" @@ -49,14 +49,10 @@ extern void print_tree_update_probs(); -static void set_default_lf_deltas(VP9_COMP *cpi); +static void set_default_lf_deltas(struct loopfilter *lf); #define DEFAULT_INTERP_FILTER SWITCHABLE -#define SEARCH_BEST_FILTER 0 /* to search exhaustively for - best filter */ -#define RESET_FOREACH_FILTER 0 /* whether to reset the encoder state - before trying each new filter */ #define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ #define ALTREF_HIGH_PRECISION_MV 1 /* whether to use high precision mv @@ -98,15 +94,11 @@ FILE *keyfile; #ifdef ENTROPY_STATS -extern int intra_mode_stats[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES]; +extern int intra_mode_stats[INTRA_MODES] + [INTRA_MODES] + [INTRA_MODES]; #endif -#ifdef NMV_STATS -extern void init_nmvstats(); -extern void print_nmvstats(); -#endif #ifdef MODE_STATS extern void init_tx_count_stats(); extern void write_tx_count_stats(); @@ -241,10 +233,9 @@ void vp9_initialize_enc() { } } -static void setup_features(VP9_COMP *cpi) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct loopfilter *const lf = &xd->lf; - struct segmentation *const seg = &xd->seg; +static void setup_features(VP9_COMMON *cm) { + struct loopfilter *const lf = &cm->lf; + struct segmentation *const seg = &cm->seg; // Set up default state for MB feature flags seg->enabled = 0; @@ -262,7 +253,7 @@ static void setup_features(VP9_COMP *cpi) { vp9_zero(lf->last_ref_deltas); vp9_zero(lf->last_mode_deltas); - set_default_lf_deltas(cpi); + set_default_lf_deltas(lf); } static void dealloc_compressor_data(VP9_COMP *cpi) { @@ -324,8 +315,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct segmentation *seg = &xd->seg; + struct segmentation *seg = &cm->seg; int high_q = (int)(cpi->avg_q > 48.0); int qi_delta; @@ -450,9 +440,9 @@ static void configure_static_seg_features(VP9_COMP *cpi) { void vp9_update_mode_context_stats(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int i, j; - unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = + unsigned int (*inter_mode_counts)[INTER_MODES - 1][2] = cm->fc.inter_mode_counts; - int64_t (*mv_ref_stats)[VP9_INTER_MODES - 1][2] = cpi->mv_ref_stats; + int64_t (*mv_ref_stats)[INTER_MODES - 1][2] = cpi->mv_ref_stats; FILE *f; // Read the past stats counters @@ -466,7 +456,7 @@ void vp9_update_mode_context_stats(VP9_COMP *cpi) { // Add in the values for this frame for (i = 0; i < INTER_MODE_CONTEXTS; i++) { - for (j = 0; j < VP9_INTER_MODES - 1; j++) { + for (j = 0; j < INTER_MODES - 1; j++) { mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0]; mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1]; } @@ -485,12 +475,12 @@ void print_mode_context(VP9_COMP *cpi) { fprintf(f, "#include \"vp9_entropy.h\"\n"); fprintf( f, - "const int inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1] ="); + "const int inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] ="); fprintf(f, "{\n"); for (j = 0; j < INTER_MODE_CONTEXTS; j++) { fprintf(f, " {/* %d */ ", j); fprintf(f, " "); - for 
(i = 0; i < VP9_INTER_MODES - 1; i++) { + for (i = 0; i < INTER_MODES - 1; i++) { int this_prob; int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1]; if (count) @@ -533,22 +523,20 @@ static void print_seg_map(VP9_COMP *cpi) { static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int row, col; - MODE_INFO *mi, *mi_ptr = cm->mi; + MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible; uint8_t *cache_ptr = cm->last_frame_seg_map, *cache; for (row = 0; row < cm->mi_rows; row++) { - mi = mi_ptr; + mi_8x8 = mi_8x8_ptr; cache = cache_ptr; - for (col = 0; col < cm->mi_cols; col++, mi++, cache++) - cache[0] = mi->mbmi.segment_id; - mi_ptr += cm->mode_info_stride; + for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) + cache[0] = mi_8x8[0]->mbmi.segment_id; + mi_8x8_ptr += cm->mode_info_stride; cache_ptr += cm->mi_cols; } } -static void set_default_lf_deltas(VP9_COMP *cpi) { - struct loopfilter *lf = &cpi->mb.e_mbd.lf; - +static void set_default_lf_deltas(struct loopfilter *lf) { lf->mode_ref_delta_enabled = 1; lf->mode_ref_delta_update = 1; @@ -565,9 +553,8 @@ static void set_default_lf_deltas(VP9_COMP *cpi) { lf->mode_deltas[1] = 0; // New mv } -static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { +static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { SPEED_FEATURES *sf = &cpi->sf; - int speed_multiplier = speed + 1; int i; // Set baseline threshold values @@ -578,46 +565,46 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { sf->thresh_mult[THR_NEARESTG] = 0; sf->thresh_mult[THR_NEARESTA] = 0; - sf->thresh_mult[THR_NEWMV] += speed_multiplier * 1000; - sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000; - sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1000; - - sf->thresh_mult[THR_DC] += speed_multiplier * 1000; - - sf->thresh_mult[THR_NEWG] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEWA] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000; - - sf->thresh_mult[THR_TM] += speed_multiplier * 1000; - - sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEWLA] += speed_multiplier * 2000; - sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000; - sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEWGA] += speed_multiplier * 2000; - - sf->thresh_mult[THR_SPLITMV] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITG] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITA] += speed_multiplier * 2500; - sf->thresh_mult[THR_COMP_SPLITLA] += speed_multiplier * 4500; - sf->thresh_mult[THR_COMP_SPLITGA] += speed_multiplier * 4500; - - sf->thresh_mult[THR_ZEROMV] += speed_multiplier * 2000; - sf->thresh_mult[THR_ZEROG] += speed_multiplier * 2000; - sf->thresh_mult[THR_ZEROA] += speed_multiplier * 2000; - sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 2500; - sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 2500; - - sf->thresh_mult[THR_B_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_H_PRED] += speed_multiplier * 2000; - sf->thresh_mult[THR_V_PRED] += speed_multiplier * 2000; - sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 2500; - sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500; - sf->thresh_mult[THR_D27_PRED] += 
speed_multiplier * 2500; - sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_NEWMV] += 1000; + sf->thresh_mult[THR_COMP_NEARESTLA] += 1000; + sf->thresh_mult[THR_NEARMV] += 1000; + sf->thresh_mult[THR_COMP_NEARESTGA] += 1000; + + sf->thresh_mult[THR_DC] += 1000; + + sf->thresh_mult[THR_NEWG] += 1000; + sf->thresh_mult[THR_NEWA] += 1000; + sf->thresh_mult[THR_NEARA] += 1000; + + sf->thresh_mult[THR_TM] += 1000; + + sf->thresh_mult[THR_COMP_NEARLA] += 1500; + sf->thresh_mult[THR_COMP_NEWLA] += 2000; + sf->thresh_mult[THR_NEARG] += 1000; + sf->thresh_mult[THR_COMP_NEARGA] += 1500; + sf->thresh_mult[THR_COMP_NEWGA] += 2000; + + sf->thresh_mult[THR_SPLITMV] += 2500; + sf->thresh_mult[THR_SPLITG] += 2500; + sf->thresh_mult[THR_SPLITA] += 2500; + sf->thresh_mult[THR_COMP_SPLITLA] += 4500; + sf->thresh_mult[THR_COMP_SPLITGA] += 4500; + + sf->thresh_mult[THR_ZEROMV] += 2000; + sf->thresh_mult[THR_ZEROG] += 2000; + sf->thresh_mult[THR_ZEROA] += 2000; + sf->thresh_mult[THR_COMP_ZEROLA] += 2500; + sf->thresh_mult[THR_COMP_ZEROGA] += 2500; + + sf->thresh_mult[THR_B_PRED] += 2500; + sf->thresh_mult[THR_H_PRED] += 2000; + sf->thresh_mult[THR_V_PRED] += 2000; + sf->thresh_mult[THR_D45_PRED ] += 2500; + sf->thresh_mult[THR_D135_PRED] += 2500; + sf->thresh_mult[THR_D117_PRED] += 2500; + sf->thresh_mult[THR_D153_PRED] += 2500; + sf->thresh_mult[THR_D207_PRED] += 2500; + sf->thresh_mult[THR_D63_PRED] += 2500; if (cpi->sf.skip_lots_of_modes) { for (i = 0; i < MAX_MODES; ++i) @@ -713,9 +700,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->search_method = NSTEP; sf->auto_filter = 1; sf->recode_loop = 1; - sf->quarter_pixel_search = 1; - sf->half_pixel_search = 1; - sf->iterative_sub_pixel = 1; + sf->subpel_search_method = SUBPEL_TREE; + sf->subpel_iters_per_step = 2; sf->optimize_coefficients = !cpi->oxcf.lossless; sf->reduce_first_step_size = 0; sf->auto_mv_step_size = 0; @@ -724,11 +710,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_rd_thresh = 0; sf->use_lastframe_partitioning = 0; sf->tx_size_search_method = USE_FULL_RD; - sf->use_8tap_always = 0; + sf->use_lp32x32fdct = 0; + sf->adaptive_motion_search = 0; sf->use_avoid_tested_higherror = 0; sf->reference_masking = 0; sf->skip_lots_of_modes = 0; - sf->adjust_thresholds_by_speed = 0; sf->partition_by_variance = 0; sf->use_one_partition_size_always = 0; sf->less_rectangular_check = 0; @@ -736,22 +722,23 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->auto_min_max_partition_size = 0; sf->auto_min_max_partition_interval = 0; sf->auto_min_max_partition_count = 0; - // sf->use_max_partition_size = 0; sf->max_partition_size = BLOCK_64X64; - // sf->use_min_partition_size = 0; sf->min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; sf->last_partitioning_redo_frequency = 4; sf->disable_splitmv = 0; sf->mode_search_skip_flags = 0; - sf->last_chroma_intra_mode = TM_PRED; + sf->disable_split_var_thresh = 0; + sf->disable_filter_search_var_thresh = 0; + sf->intra_y_mode_mask = ALL_INTRA_MODES; + sf->intra_uv_mode_mask = ALL_INTRA_MODES; sf->use_rd_breakout = 0; sf->skip_encode_sb = 0; sf->use_uv_intra_rd_estimate = 0; + sf->use_fast_lpf_pick = 0; + sf->use_fast_coef_updates = 0; sf->using_small_partition_info = 0; - // Skip any mode not chosen at size < X for all sizes > X - // Hence BLOCK_64X64 (skip is off) - sf->unused_mode_skip_lvl = BLOCK_64X64; + sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set #if CONFIG_MULTIPLE_ARF // Switch segmentation off. 
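The thresh_mult values set above feed early mode pruning in the RD loop: each mode's threshold is scaled by a per-mode frequency factor (initialised to 32 later in this commit) and a mode is skipped once the best RD cost found so far already beats its adjusted threshold. A schematic of that gate; the shift by 5 (so that 32 means x1) is an assumption for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_MODES 3

    int main(void) {
      int64_t best_rd = 90000;                       /* best cost so far */
      int64_t rd_thresh[MAX_MODES] = { 80000, 100000, 250000 };
      int thresh_freq_fact[MAX_MODES] = { 32, 32, 32 };  /* 32 == x1 */
      int mode;

      for (mode = 0; mode < MAX_MODES; mode++) {
        /* Prune modes whose adjusted threshold already exceeds best_rd. */
        if (best_rd < (rd_thresh[mode] * thresh_freq_fact[mode] >> 5)) {
          printf("mode %d pruned\n", mode);
          continue;
        }
        printf("mode %d evaluated\n", mode);
      }
      return 0;
    }

Higher thresh_mult biases, such as the +4500 on compound split modes, therefore translate directly into earlier pruning of expensive modes.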
@@ -762,7 +749,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { switch (mode) { case 0: // best quality mode - sf->search_best_filter = SEARCH_BEST_FILTER; break; case 1: @@ -773,9 +759,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif sf->use_avoid_tested_higherror = 1; - sf->adaptive_rd_thresh = 1; + sf->adaptive_rd_thresh = MIN((speed + 1), 4); + if (speed == 1) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->less_rectangular_check = 1; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || @@ -787,7 +774,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->common.show_frame == 0); sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; - sf->unused_mode_skip_lvl = BLOCK_32X32; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | @@ -795,22 +781,30 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; + sf->adaptive_motion_search = 1; sf->auto_mv_step_size = 1; sf->auto_min_max_partition_size = 1; - // sf->use_max_partition_size = 1; - // sf->use_min_partition_size = 1; sf->auto_min_max_partition_interval = 1; + // FIXME(jingning): temporarily turn off disable_split_var_thresh + // during refactoring process. will get this back after finishing + // the main framework of partition search type. + sf->disable_split_var_thresh = 0; + sf->disable_filter_search_var_thresh = 16; + + sf->intra_y_mode_mask = INTRA_DC_TM_H_V; + sf->intra_uv_mode_mask = INTRA_DC_TM_H_V; + sf->use_fast_coef_updates = 1; + sf->mode_skip_start = 9; } if (speed == 2) { - sf->adjust_thresholds_by_speed = 1; sf->less_rectangular_check = 1; sf->use_square_partition_only = 1; - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->use_lastframe_partitioning = 1; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - sf->unused_mode_skip_lvl = BLOCK_32X32; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || cpi->common.show_frame == 0) ? @@ -822,17 +816,29 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_COMP_REFMISMATCH | FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; - sf->last_chroma_intra_mode = DC_PRED; + sf->intra_y_mode_mask = INTRA_DC_TM; + sf->intra_uv_mode_mask = INTRA_DC_TM; sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; - sf->using_small_partition_info = 1; + sf->use_lp32x32fdct = 1; + sf->adaptive_motion_search = 1; + sf->using_small_partition_info = 0; sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 
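mode_search_skip_flags packs several independent skip heuristics into one bitmask, composed with bitwise OR and queried with bitwise AND. A minimal sketch (the flag values are invented for the sketch; only the compose-and-test pattern matters):

    #include <stdio.h>

    #define FLAG_SKIP_INTRA_DIRMISMATCH (1 << 0)
    #define FLAG_SKIP_INTRA_BESTINTER   (1 << 1)
    #define FLAG_SKIP_COMP_BESTINTRA    (1 << 2)
    #define FLAG_EARLY_TERMINATE        (1 << 3)

    int main(void) {
      unsigned flags = FLAG_SKIP_INTRA_DIRMISMATCH |
                       FLAG_SKIP_INTRA_BESTINTER |
                       FLAG_EARLY_TERMINATE;

      if (flags & FLAG_EARLY_TERMINATE)
        printf("early termination heuristic on\n");
      if (!(flags & FLAG_SKIP_COMP_BESTINTRA))
        printf("compound-vs-intra skip heuristic off\n");
      return 0;
    }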
1 : 0; sf->auto_mv_step_size = 1; + sf->search_method = SQUARE; + sf->subpel_iters_per_step = 1; + sf->use_fast_lpf_pick = 1; + sf->auto_min_max_partition_size = 1; + sf->auto_min_max_partition_interval = 2; + sf->disable_split_var_thresh = 32; + sf->disable_filter_search_var_thresh = 32; + sf->use_fast_coef_updates = 2; + sf->mode_skip_start = 9; } if (speed == 3) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->partition_by_variance = 1; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || @@ -847,11 +853,20 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; sf->disable_splitmv = 1; sf->auto_mv_step_size = 1; + sf->search_method = BIGDIA; + sf->subpel_iters_per_step = 1; + sf->disable_split_var_thresh = 64; + sf->disable_filter_search_var_thresh = 64; + sf->intra_y_mode_mask = INTRA_DC_ONLY; + sf->intra_uv_mode_mask = INTRA_DC_ONLY; + sf->use_fast_coef_updates = 2; + sf->mode_skip_start = 9; } if (speed == 4) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->use_one_partition_size_always = 1; sf->always_this_block_size = BLOCK_16X16; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || @@ -866,37 +881,28 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; + sf->use_lp32x32fdct = 1; sf->optimize_coefficients = 0; sf->auto_mv_step_size = 1; // sf->reduce_first_step_size = 1; // sf->reference_masking = 1; sf->disable_splitmv = 1; + sf->search_method = HEX; + sf->subpel_iters_per_step = 1; + sf->disable_split_var_thresh = 64; + sf->disable_filter_search_var_thresh = 96; + sf->intra_y_mode_mask = INTRA_DC_ONLY; + sf->intra_uv_mode_mask = INTRA_DC_ONLY; + sf->use_fast_coef_updates = 2; + sf->mode_skip_start = 9; } - /* - if (speed == 2) { - sf->first_step = 0; - sf->comp_inter_joint_search_thresh = BLOCK_8X8; - sf->use_max_partition_size = 1; - sf->max_partition_size = BLOCK_16X16; - } - if (speed == 3) { - sf->first_step = 0; - sf->comp_inter_joint_search_thresh = BLOCK_B8X8; - sf->use_min_partition_size = 1; - sf->min_partition_size = BLOCK_8X8; - } - */ - break; }; /* switch */ // Set rd thresholds based on mode and speed setting - if (cpi->sf.adjust_thresholds_by_speed) - set_rd_speed_thresholds(cpi, mode, speed); - else - set_rd_speed_thresholds(cpi, mode, 0); + set_rd_speed_thresholds(cpi, mode); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. 
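Speeds 2, 3 and 4 now select different integer search patterns (SQUARE, BIGDIA and HEX respectively), all of which are thin wrappers around vp9_pattern_search with different candidate tables. A hedged sketch of how the setting maps to the wrappers; the dispatch table itself is illustrative:

    #include <stdio.h>

    typedef enum { NSTEP, HEX, BIGDIA, SQUARE } SEARCH_METHODS;

    static const char *pattern_name(SEARCH_METHODS m) {
      switch (m) {
        case HEX:    return "vp9_hex_search";     /* 6-point hexagon rings */
        case BIGDIA: return "vp9_bigdia_search";  /* 8-point diamond rings */
        case SQUARE: return "vp9_square_search";  /* 8-point square rings */
        default:     return "diamond step search";
      }
    }

    int main(void) {
      /* Mirrors the speed-feature assignments in this hunk. */
      SEARCH_METHODS speed_to_method[5] = { NSTEP, NSTEP, SQUARE, BIGDIA, HEX };
      int speed;
      for (speed = 0; speed < 5; speed++)
        printf("speed %d -> %s\n", speed, pattern_name(speed_to_method[speed]));
      return 0;
    }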
@@ -915,12 +921,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; - if (cpi->sf.iterative_sub_pixel == 1) { - cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively; - } else if (cpi->sf.quarter_pixel_search) { - cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step; - } else if (cpi->sf.half_pixel_search) { - cpi->find_fractional_mv_step = vp9_find_best_half_pixel_step; + if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative; + cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative; + } else if (cpi->sf.subpel_search_method == SUBPEL_TREE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; + cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree; } cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1; @@ -1163,6 +1169,9 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->gld_fb_idx = 1; cpi->alt_fb_idx = 2; + cpi->current_layer = 0; + cpi->use_svc = 0; + set_tile_limits(cpi); cpi->fixed_divide[0] = 0; @@ -1227,7 +1236,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cm->refresh_frame_context = 1; cm->reset_frame_context = 0; - setup_features(cpi); + setup_features(cm); cpi->mb.e_mbd.allow_high_precision_mv = 0; // Default mv precision adaptation set_mvcost(&cpi->mb); @@ -1297,7 +1306,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness); - cpi->mb.e_mbd.lf.sharpness_level = cpi->oxcf.Sharpness; + cpi->common.lf.sharpness_level = cpi->oxcf.Sharpness; if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some @@ -1382,7 +1391,7 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { } VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { - int i; + int i, j; volatile union { VP9_COMP *cpi; VP9_PTR ptr; @@ -1433,14 +1442,13 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->alt_is_last = 0; cpi->gold_is_alt = 0; + // Spatial scalability + cpi->number_spatial_layers = oxcf->ss_number_layers; + // Create the encoder segmentation map and set all entries to 0 CHECK_MEM_ERROR(cm, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - // And a copy in common for temporal coding - CHECK_MEM_ERROR(cm, cm->last_frame_seg_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - // And a place holder structure is the coding context // for use if we want to save and restore it CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, @@ -1462,9 +1470,6 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { init_context_counters(); #endif -#ifdef NMV_STATS - init_nmvstats(); -#endif #ifdef MODE_STATS init_tx_count_stats(); init_switchable_interp_stats(); @@ -1576,6 +1581,8 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->output_pkt_list = oxcf->output_pkt_list; + cpi->enable_encode_breakout = 1; + if (cpi->pass == 1) { vp9_init_first_pass(cpi); } else if (cpi->pass == 2) { @@ -1591,9 +1598,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { vp9_set_speed_features(cpi); - // Set starting values of RD threshold multipliers (128 = *1) - for (i = 0; i < MAX_MODES; i++) - cpi->rd_thresh_mult[i] = 128; + // Default rd threshold factors for mode selection + for (i = 0; i < BLOCK_SIZES; ++i) + for (j = 0; j < MAX_MODES; ++j) + cpi->rd_thresh_freq_fact[i][j] = 32; #define BFP(BT, SDF, 
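The subpel_search_method switch above picks the fractional-MV refinement strategy once and stores it as a function pointer, so the hot path never re-branches on the speed feature. A stripped-down sketch of that pattern (the real fractional_mv_step_fp signature is the long one declared in the vp9_mcomp.h hunk earlier; the one-argument version here is only for illustration):

    #include <stdio.h>

    typedef enum { SUBPEL_ITERATIVE, SUBPEL_TREE } SUBPEL_SEARCH_METHODS;
    typedef int (*fractional_mv_step_fp)(int hstep);

    /* Stand-ins for the two strategies named in the header changes. */
    static int sub_pixel_iterative(int hstep) { return hstep + 1; }
    static int sub_pixel_tree(int hstep)      { return hstep + 2; }

    int main(void) {
      SUBPEL_SEARCH_METHODS method = SUBPEL_TREE;
      fractional_mv_step_fp find_fractional_mv_step =
          (method == SUBPEL_ITERATIVE) ? sub_pixel_iterative
                                       : sub_pixel_tree;

      /* Callers go through the pointer, unaware of the chosen strategy. */
      printf("result: %d\n", find_fractional_mv_step(4));
      return 0;
    }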
SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ SDX3F, SDX8F, SDX4DF)\ @@ -1700,12 +1708,16 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { */ vp9_init_quantizer(cpi); - vp9_loop_filter_init(cm, &cpi->mb.e_mbd.lf); + vp9_loop_filter_init(cm); cpi->common.error.setjmp = 0; vp9_zero(cpi->y_uv_mode_count) +#ifdef MODE_TEST_HIT_STATS + vp9_zero(cpi->mode_test_hits) +#endif + return (VP9_PTR) cpi; } @@ -1728,10 +1740,7 @@ void vp9_remove_compressor(VP9_PTR *ptr) { print_mode_context(cpi); } #endif -#ifdef NMV_STATS - if (cpi->pass != 1) - print_nmvstats(); -#endif + #ifdef MODE_STATS if (cpi->pass != 1) { write_tx_count_stats(); @@ -1790,6 +1799,34 @@ void vp9_remove_compressor(VP9_PTR *ptr) { #endif +#ifdef MODE_TEST_HIT_STATS + if (cpi->pass != 1) { + double norm_per_pixel_mode_tests = 0; + double norm_counts[BLOCK_SIZES]; + int i; + int sb64_per_frame; + int norm_factors[BLOCK_SIZES] = + {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1}; + FILE *f = fopen("mode_hit_stats.stt", "a"); + + // On average, how many mode tests do we do + for (i = 0; i < BLOCK_SIZES; ++i) { + norm_counts[i] = (double)cpi->mode_test_hits[i] / + (double)norm_factors[i]; + norm_per_pixel_mode_tests += norm_counts[i]; + } + // Convert to a number per 64x64 and per frame + sb64_per_frame = ((cpi->common.height + 63) / 64) * + ((cpi->common.width + 63) / 64); + norm_per_pixel_mode_tests = + norm_per_pixel_mode_tests / + (double)(cpi->common.current_video_frame * sb64_per_frame); + + fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests); + fclose(f); + } +#endif + #ifdef ENTROPY_STATS { int i, j, k; @@ -1797,18 +1834,18 @@ void vp9_remove_compressor(VP9_PTR *ptr) { fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n"); fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts "); - fprintf(fmode, "[VP9_INTRA_MODES][VP9_INTRA_MODES]" - "[VP9_INTRA_MODES] =\n{\n"); + fprintf(fmode, "[INTRA_MODES][INTRA_MODES]" + "[INTRA_MODES] =\n{\n"); - for (i = 0; i < VP9_INTRA_MODES; i++) { + for (i = 0; i < INTRA_MODES; i++) { fprintf(fmode, " { // Above Mode : %d\n", i); - for (j = 0; j < VP9_INTRA_MODES; j++) { + for (j = 0; j < INTRA_MODES; j++) { fprintf(fmode, " {"); - for (k = 0; k < VP9_INTRA_MODES; k++) { + for (k = 0; k < INTRA_MODES; k++) { if (!intra_mode_stats[i][j][k]) fprintf(fmode, " %5d, ", 1); else @@ -2214,6 +2251,12 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) { cpi->source_alt_ref_pending = 1; cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; + + // TODO(ivan): for SVC encoder, GF automatic update is disabled by using a + // large GF_interval + if (cpi->use_svc) { + cpi->frames_till_gf_update_due = INT_MAX; + } } if (!cpi->source_alt_ref_pending) @@ -2379,7 +2422,8 @@ static void update_reference_frames(VP9_COMP * const cpi) { else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { #else - else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) { + else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && + !cpi->use_svc) { #endif /* Preserve the previously existing golden frame and update the frame in * the alt ref slot instead. 
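The MODE_TEST_HIT_STATS block above normalises mode-test counts to tests per 64x64 superblock per frame: each per-block-size count is divided by how many blocks of that size fit into a 64x64 (the norm_factors table), the results are summed, and the sum is divided by frames times superblocks per frame. For 1280x720 that is ((720 + 63) / 64) * ((1280 + 63) / 64) = 12 * 20 = 240 superblocks. The same computation standalone, with invented counts:

    #include <stdio.h>

    #define BLOCK_SIZES 13

    int main(void) {
      /* Blocks of each size per 64x64 superblock, as in the diff. */
      static const int norm_factors[BLOCK_SIZES] =
          { 256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1 };
      long mode_test_hits[BLOCK_SIZES] = { 0 };  /* invented sample data */
      double per_sb64 = 0.0;
      int i, frames = 100;
      int width = 1280, height = 720;
      int sb64_per_frame = ((height + 63) / 64) * ((width + 63) / 64);

      mode_test_hits[0] = 6144000;               /* e.g. 4x4 tests */
      mode_test_hits[BLOCK_SIZES - 1] = 24000;   /* e.g. 64x64 tests */

      for (i = 0; i < BLOCK_SIZES; i++)
        per_sb64 += (double)mode_test_hits[i] / norm_factors[i];
      per_sb64 /= (double)frames * sb64_per_frame;

      printf("sb64/frame = %d, avg mode tests per sb64 = %.4f\n",
             sb64_per_frame, per_sb64);          /* 240, 2.0000 */
      return 0;
    }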
This is highly specific to the current use of @@ -2424,7 +2468,7 @@ static void update_reference_frames(VP9_COMP * const cpi) { static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct loopfilter *lf = &xd->lf; + struct loopfilter *lf = &cm->lf; if (xd->lossless) { lf->filter_level = 0; } else { @@ -2434,7 +2478,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_usec_timer_start(&timer); - vp9_pick_filter_level(cpi->Source, cpi); + vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick); vpx_usec_timer_mark(&timer); cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); @@ -2442,7 +2486,7 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { if (lf->filter_level > 0) { vp9_set_alt_lf_level(cpi, lf->filter_level); - vp9_loop_filter_frame(cm, xd, lf->filter_level, 0); + vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show, @@ -2452,9 +2496,11 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { static void scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int i; + int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx, + cpi->alt_fb_idx}; for (i = 0; i < 3; i++) { - YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]]; + YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[refs[i]]]; if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { @@ -2467,8 +2513,8 @@ static void scale_references(VP9_COMP *cpi) { scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]); cpi->scaled_ref_idx[i] = new_fb; } else { - cpi->scaled_ref_idx[i] = cm->ref_frame_map[i]; - cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++; + cpi->scaled_ref_idx[i] = cm->ref_frame_map[refs[i]]; + cm->fb_idx_ref_cnt[cm->ref_frame_map[refs[i]]]++; } } } @@ -2532,25 +2578,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, SPEED_FEATURES *sf = &cpi->sf; unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); - struct segmentation *seg = &xd->seg; -#if RESET_FOREACH_FILTER - int q_low0; - int q_high0; - int Q0; - int active_best_quality0; - int active_worst_quality0; - double rate_correction_factor0; - double gf_rate_correction_factor0; -#endif - - /* list of filters to search over */ - int mcomp_filters_to_search[] = { - EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE - }; - int mcomp_filters = sizeof(mcomp_filters_to_search) / - sizeof(*mcomp_filters_to_search); - int mcomp_filter_index = 0; - int64_t mcomp_filter_cost[4]; + struct segmentation *seg = &cm->seg; /* Scale the source buffer, if required */ if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || @@ -2603,7 +2631,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set default state for segment based loop filter update flags - xd->lf.mode_ref_delta_update = 0; + cm->lf.mode_ref_delta_update = 0; // Initialize cpi->mv_step_param to default based on max resolution cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); @@ -2626,10 +2654,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set various flags etc to special state if it is a key frame if (cm->frame_type == KEY_FRAME) { - int i; - // Reset the loop filter deltas and segmentation map - setup_features(cpi); + setup_features(cm); // If segmentation is enabled force a map update for key frames if (seg->enabled) { @@ -2640,10 +2666,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // The alternate reference frame cannot be active for a key frame 
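The scale_references change above is a real fix rather than a rename: the old code indexed ref_frame_map with the loop counter, but the last, golden and alt references live at cpi->lst_fb_idx, cpi->gld_fb_idx and cpi->alt_fb_idx, which need not be 0, 1 and 2. The lookup is a two-step indirection, sketched with invented buffer numbers:

    #include <stdio.h>

    int main(void) {
      /* Reference slots -> frame-buffer indices (values invented). */
      int ref_frame_map[4] = { 3, 0, 7, 1 };
      int lst_fb_idx = 0, gld_fb_idx = 1, alt_fb_idx = 2;
      int refs[3], i;

      refs[0] = lst_fb_idx;
      refs[1] = gld_fb_idx;
      refs[2] = alt_fb_idx;

      /* Two-level lookup, as in the fixed scale_references():
       * reference type -> map slot -> physical frame buffer. */
      for (i = 0; i < 3; i++)
        printf("ref %d -> slot %d -> frame buffer %d\n",
               i, refs[i], ref_frame_map[refs[i]]);
      return 0;
    }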
cpi->source_alt_ref_active = 0; - // Reset the RD threshold multipliers to default of * 1 (128) - for (i = 0; i < MAX_MODES; i++) - cpi->rd_thresh_mult[i] = 128; - cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); cm->frame_parallel_decoding_mode = (cpi->oxcf.frame_parallel_decoding_mode != 0); @@ -2672,9 +2694,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cm->frame_type == KEY_FRAME) { #if !CONFIG_MULTIPLE_ARF - // Special case for key frames forced because we have reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping + // Special case for key frames forced because we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping if (cpi->this_key_frame_forced) { int delta_qindex; int qindex = cpi->last_boosted_qindex; @@ -2683,7 +2705,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, delta_qindex = compute_qdelta(cpi, last_boosted_q, (last_boosted_q * 0.75)); - cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality); + cpi->active_best_quality = MAX(qindex + delta_qindex, + cpi->best_quality); } else { int high = 5000; int low = 400; @@ -2704,7 +2727,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->active_best_quality = kf_low_motion_minq[q] + adjustment; } - // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { q_adj_factor -= 0.25; @@ -2713,14 +2735,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Make a further adjustment based on the kf zero motion measure. q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct); - // Convert the adjustment factor to a qindex delta on active_best_quality. + // Convert the adjustment factor to a qindex delta + // on active_best_quality. q_val = vp9_convert_qindex_to_q(cpi->active_best_quality); cpi->active_best_quality += - compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); + compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); } #else double current_q; - // Force the KF quantizer to be 30% of the active_worst_quality. current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); cpi->active_best_quality = cpi->active_worst_quality @@ -2737,13 +2759,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->avg_frame_qindex < cpi->active_worst_quality) { q = cpi->avg_frame_qindex; } - // For constrained quality dont allow Q less than the cq level if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < cpi->cq_target_quality) { q = cpi->cq_target_quality; } - if (cpi->gfu_boost > high) { cpi->active_best_quality = gf_low_motion_minq[q]; } else if (cpi->gfu_boost < low) { @@ -2760,28 +2780,54 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Constrained quality use slightly lower active best. 
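For key frames forced by the maximum interval, the code above anchors quality to the last boosted Q: compute_qdelta converts the difference between that Q and 75% of it back into a qindex delta, which is then floored at best_quality. A toy version of the arithmetic; the linear q-to-qindex mapping below is only a stand-in for the real, nonlinear vp9_convert_qindex_to_q:

    #include <stdio.h>

    int main(void) {
      /* Linear q <-> qindex mapping as a stand-in for
       * vp9_convert_qindex_to_q / compute_qdelta. */
      double q_per_index = 0.5;
      int qindex = 80;                                  /* last_boosted_qindex */
      double last_boosted_q = qindex * q_per_index;     /* 40.0 */
      double target_q = last_boosted_q * 0.75;          /* 30.0 */
      int delta_qindex = (int)((target_q - last_boosted_q) / q_per_index);
      int best_quality = 0;
      int active_best = qindex + delta_qindex;          /* 80 - 20 = 60 */

      if (active_best < best_quality)
        active_best = best_quality;                     /* MAX(...) in the diff */
      printf("qindex %d -> active_best_quality %d\n", qindex, active_best);
      return 0;
    }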
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) cpi->active_best_quality = cpi->active_best_quality * 15 / 16; + + // TODO(debargha): Refine the logic below + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + if (!cpi->refresh_alt_ref_frame) { + cpi->active_best_quality = cpi->cq_target_quality; + } else { + if (cpi->frames_since_key > 1) { + if (cpi->gfu_boost > high) { + cpi->active_best_quality = cpi->cq_target_quality * 6 / 16; + } else if (cpi->gfu_boost < low) { + cpi->active_best_quality = cpi->cq_target_quality * 11 / 16; + } else { + const int gap = high - low; + const int offset = high - cpi->gfu_boost; + const int qdiff = cpi->cq_target_quality * 5 / 16; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + cpi->active_best_quality = cpi->cq_target_quality * 6 / 16 + + adjustment; + } + } + } + } } else { + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + cpi->active_best_quality = cpi->cq_target_quality; + } else { #ifdef ONE_SHOT_Q_ESTIMATE #ifdef STRICT_ONE_SHOT_Q - cpi->active_best_quality = q; + cpi->active_best_quality = q; #else - cpi->active_best_quality = inter_minq[q]; + cpi->active_best_quality = inter_minq[q]; #endif #else - cpi->active_best_quality = inter_minq[q]; + cpi->active_best_quality = inter_minq[q]; #endif - // For the constant/constrained quality mode we don't want - // q to fall below the cq level. - if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && - (cpi->active_best_quality < cpi->cq_target_quality)) { - // If we are strongly undershooting the target rate in the last - // frames then use the user passed in cq value not the auto - // cq value. - if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth) - cpi->active_best_quality = cpi->oxcf.cq_level; - else - cpi->active_best_quality = cpi->cq_target_quality; + // For the constant/constrained quality mode we don't want + // q to fall below the cq level. + if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && + (cpi->active_best_quality < cpi->cq_target_quality)) { + // If we are strongly undershooting the target rate in the last + // frames then use the user passed in cq value not the auto + // cq value. + if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth) + cpi->active_best_quality = cpi->oxcf.cq_level; + else + cpi->active_best_quality = cpi->cq_target_quality; + } } } @@ -2799,7 +2845,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->active_worst_quality = cpi->active_best_quality; // Special case code to try and match quality with forced key frames - if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + q = cpi->active_best_quality; + } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { q = cpi->last_boosted_qindex; } else { // Determine initial Q to try @@ -2811,7 +2859,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, #if CONFIG_MULTIPLE_ARF // Force the quantizer determined by the coding order pattern. 
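In the new constant-quality branch above, active_best_quality interpolates between 6/16 and 11/16 of cq_target_quality as gfu_boost falls from the high threshold to the low one; ((offset * qdiff) + (gap >> 1)) / gap is integer division rounded to nearest. Worked numbers (the thresholds and boost below are invented; only the arithmetic is the point):

    #include <stdio.h>

    int main(void) {
      int low = 400, high = 2000;           /* invented boost thresholds */
      int cq_target_quality = 160;
      int gfu_boost = 1200;                 /* halfway between low and high */

      int gap = high - low;                          /* 1600 */
      int offset = high - gfu_boost;                 /* 800  */
      int qdiff = cq_target_quality * 5 / 16;        /* 50   */
      /* Rounded-to-nearest division: (offset*qdiff + gap/2) / gap. */
      int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;  /* 25 */
      int active_best = cq_target_quality * 6 / 16 + adjustment;

      /* Halfway boost lands halfway between 6/16 and 11/16: 85. */
      printf("adjustment = %d, active_best = %d\n", adjustment, active_best);
      return 0;
    }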
- if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME)) { + if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && + cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { double new_q; double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); int level = cpi->this_frame_weight; @@ -2841,19 +2890,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_zero(cpi->rd_tx_select_threshes); if (cm->frame_type != KEY_FRAME) { - /* TODO: Decide this more intelligently */ - if (sf->search_best_filter) { - cm->mcomp_filter_type = mcomp_filters_to_search[0]; - mcomp_filter_index = 0; - } else { - cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; - } + cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; /* TODO: Decide this more intelligently */ xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH; set_mvcost(&cpi->mb); } -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; @@ -2886,17 +2929,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_frame(cpi->Source); #endif -#if RESET_FOREACH_FILTER - if (sf->search_best_filter) { - q_low0 = q_low; - q_high0 = q_high; - Q0 = Q; - rate_correction_factor0 = cpi->rate_correction_factor; - gf_rate_correction_factor0 = cpi->gf_rate_correction_factor; - active_best_quality0 = cpi->active_best_quality; - active_worst_quality0 = cpi->active_worst_quality; - } -#endif do { vp9_clear_system_state(); // __asm emms; @@ -2946,178 +2978,135 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, active_worst_qchanged = 0; // Special case handling for forced key frames - if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { - int last_q = q; - int kf_err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); - - int high_err_target = cpi->ambient_err; - int low_err_target = cpi->ambient_err >> 1; - - // Prevent possible divide by zero error below for perfect KF - kf_err += !kf_err; - - // The key frame is not good enough or we can afford - // to make it better without undue risk of popping. - if ((kf_err > high_err_target && - cpi->projected_frame_size <= frame_over_shoot_limit) || - (kf_err > low_err_target && - cpi->projected_frame_size <= frame_under_shoot_limit)) { - // Lower q_high - q_high = q > q_low ? q - 1 : q_low; - - // Adjust Q - q = (q * high_err_target) / kf_err; - q = MIN(q, (q_high + q_low) >> 1); - } else if (kf_err < low_err_target && - cpi->projected_frame_size >= frame_under_shoot_limit) { - // The key frame is much better than the previous frame - // Raise q_low - q_low = q < q_high ? q + 1 : q_high; - - // Adjust Q - q = (q * low_err_target) / kf_err; - q = MIN(q, (q_high + q_low + 1) >> 1); - } - - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } - - // Is the projected frame size out of range and are we allowed to attempt to recode. 
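The forced-key-frame path of the recode loop rescales q in proportion to reconstruction error: a frame that is not good enough, but has headroom under the overshoot limit, tightens q_high and lowers q toward q * high_err_target / kf_err, capped at the bracket midpoint; a frame that is better than needed does the mirror image with low_err_target. One iteration of the "not good enough" case with concrete numbers:

    #include <stdio.h>

    int main(void) {
      int q = 40, q_low = 20, q_high = 60;
      int ambient_err = 12000;
      int kf_err = 20000;                       /* frame worse than target */
      int high_err_target = ambient_err;        /* 12000 */

      /* Key frame not good enough: lower q proportionally, then cap. */
      q_high = q > q_low ? q - 1 : q_low;       /* 39 */
      q = q * high_err_target / kf_err;         /* 40 * 12000 / 20000 = 24 */
      if (q > (q_high + q_low) / 2)
        q = (q_high + q_low) / 2;               /* MIN with bracket midpoint */
      if (q < q_low)
        q = q_low;                              /* clamp(q, q_low, q_high) */
      printf("new q = %d (bracket [%d, %d])\n", q, q_low, q_high);  /* 24 */
      return 0;
    }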
- else if (recode_loop_test(cpi, - frame_over_shoot_limit, frame_under_shoot_limit, - q, top_index, bottom_index)) { - int last_q = q; - int retries = 0; + if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { + loop = 0; + } else { + if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { + int last_q = q; + int kf_err = vp9_calc_ss_err(cpi->Source, + &cm->yv12_fb[cm->new_fb_idx]); + + int high_err_target = cpi->ambient_err; + int low_err_target = cpi->ambient_err >> 1; + + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + cpi->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + cpi->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + q_high = q > q_low ? q - 1 : q_low; + + // Adjust Q + q = (q * high_err_target) / kf_err; + q = MIN(q, (q_high + q_low) >> 1); + } else if (kf_err < low_err_target && + cpi->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + q_low = q < q_high ? q + 1 : q_high; + + // Adjust Q + q = (q * low_err_target) / kf_err; + q = MIN(q, (q_high + q_low + 1) >> 1); + } - // Frame size out of permitted range: - // Update correction factor & compute new Q to try... + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = q != last_q; + } else if (recode_loop_test( + cpi, frame_over_shoot_limit, frame_under_shoot_limit, + q, top_index, bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + int last_q = q; + int retries = 0; + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + + // Frame is too large + if (cpi->projected_frame_size > cpi->this_frame_target) { + // Raise Qlow as to at least the current value + q_low = q < q_high ? q + 1 : q_high; + + if (undershoot_seen || loop_count > 1) { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 1); + + q = (q_high + q_low + 1) / 2; + } else { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 0); - // Frame is too large - if (cpi->projected_frame_size > cpi->this_frame_target) { - // Raise Qlow as to at least the current value - q_low = q < q_high ? q + 1 : q_high; + q = vp9_regulate_q(cpi, cpi->this_frame_target); - if (undershoot_seen || loop_count > 1) { - // Update rate_correction_factor unless cpi->active_worst_quality - // has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 1); + while (q < q_low && retries < 10) { + vp9_update_rate_correction_factors(cpi, 0); + q = vp9_regulate_q(cpi, cpi->this_frame_target); + retries++; + } + } - q = (q_high + q_low + 1) / 2; + overshoot_seen = 1; } else { - // Update rate_correction_factor unless cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 0); - - q = vp9_regulate_q(cpi, cpi->this_frame_target); + // Frame is too small + q_high = q > q_low ? q - 1 : q_low; + + if (overshoot_seen || loop_count > 1) { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. 
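The general recode path keeps a [q_low, q_high] bracket: an oversized frame raises q_low to at least q + 1, then either bisects (once an undershoot has been seen, so the bracket is known to contain the answer) or asks the rate controller for a fresh estimate and nudges it into range with up to 10 retries. A skeleton of the overshoot branch, with vp9_regulate_q replaced by a stub:

    #include <stdio.h>

    /* Stub for vp9_regulate_q: the rate controller's q estimate. */
    static int regulate_q(int target) { (void)target; return 28; }

    int main(void) {
      int q = 30, q_low = 10, q_high = 63;
      int undershoot_seen = 0, loop_count = 0, target = 40000;

      /* Frame too large: raise the lower bound to at least the current q. */
      q_low = q < q_high ? q + 1 : q_high;      /* 31 */

      if (undershoot_seen || loop_count > 1) {
        q = (q_high + q_low + 1) / 2;           /* bisect the bracket */
      } else {
        int retries = 0;
        q = regulate_q(target);
        while (q < q_low && retries < 10) {     /* pull estimate into bracket */
          q = regulate_q(target);
          retries++;
        }
        if (q < q_low)
          q = q_low;                            /* final clamp */
      }
      printf("retry with q = %d in [%d, %d]\n", q, q_low, q_high);  /* 31 */
      return 0;
    }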
+ if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 1); + + q = (q_high + q_low) / 2; + } else { + // Update rate_correction_factor unless + // cpi->active_worst_quality has changed. + if (!active_worst_qchanged) + vp9_update_rate_correction_factors(cpi, 0); - while (q < q_low && retries < 10) { - vp9_update_rate_correction_factors(cpi, 0); q = vp9_regulate_q(cpi, cpi->this_frame_target); - retries++; - } - } - overshoot_seen = 1; - } else { - // Frame is too small - q_high = q > q_low ? q - 1 : q_low; - - if (overshoot_seen || loop_count > 1) { - // Update rate_correction_factor unless cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 1); + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passed in value. + if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) { + q_low = q; + } - q = (q_high + q_low) / 2; - } else { - // Update rate_correction_factor unless cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 0); - - q = vp9_regulate_q(cpi, cpi->this_frame_target); - - // Special case reset for qlow for constrained quality. - // This should only trigger where there is very substantial - // undershoot on a frame and the auto cq level is above - // the user passed in value. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) { - q_low = q; + while (q > q_high && retries < 10) { + vp9_update_rate_correction_factors(cpi, 0); + q = vp9_regulate_q(cpi, cpi->this_frame_target); + retries++; + } } - while (q > q_high && retries < 10) { - vp9_update_rate_correction_factors(cpi, 0); - q = vp9_regulate_q(cpi, cpi->this_frame_target); - retries++; - } + undershoot_seen = 1; } - undershoot_seen = 1; - } + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } else { - loop = 0; + loop = q != last_q; + } else { + loop = 0; + } } if (cpi->is_src_frame_alt_ref) loop = 0; - if (!loop && cm->frame_type != KEY_FRAME && sf->search_best_filter) { - if (mcomp_filter_index < mcomp_filters) { - int64_t err = vp9_calc_ss_err(cpi->Source, - &cm->yv12_fb[cm->new_fb_idx]); - int64_t rate = cpi->projected_frame_size << 8; - mcomp_filter_cost[mcomp_filter_index] = - (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err)); - mcomp_filter_index++; - if (mcomp_filter_index < mcomp_filters) { - cm->mcomp_filter_type = mcomp_filters_to_search[mcomp_filter_index]; - loop_count = -1; - loop = 1; - } else { - int f; - int64_t best_cost = mcomp_filter_cost[0]; - int mcomp_best_filter = mcomp_filters_to_search[0]; - for (f = 1; f < mcomp_filters; f++) { - if (mcomp_filter_cost[f] < best_cost) { - mcomp_best_filter = mcomp_filters_to_search[f]; - best_cost = mcomp_filter_cost[f]; - } - } - if (mcomp_best_filter != mcomp_filters_to_search[mcomp_filters - 1]) { - loop_count = -1; - loop = 1; - cm->mcomp_filter_type = mcomp_best_filter; - } - /* - printf(" best filter = %d, ( ", mcomp_best_filter); - for (f=0;f<mcomp_filters; f++) printf("%d ", mcomp_filter_cost[f]); - printf(")\n"); - */ - } -#if RESET_FOREACH_FILTER - if (loop) { - overshoot_seen = 0; - undershoot_seen = 0; - q_low = q_low0; - q_high = q_high0; - q = Q0; - cpi->rate_correction_factor = rate_correction_factor0; - cpi->gf_rate_correction_factor =
gf_rate_correction_factor0; - cpi->active_best_quality = active_best_quality0; - cpi->active_worst_quality = active_worst_quality0; - } -#endif - } - } - if (loop) { loop_count++; @@ -3165,7 +3154,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->dummy_packing = 0; vp9_pack_bitstream(cpi, dest, size); - if (xd->seg.update_map) + if (cm->seg.update_map) update_reference_segmentation_map(cpi); release_scaled_references(cpi); @@ -3296,9 +3285,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // in this frame. // update_base_skip_probs(cpi); -#if 0 && CONFIG_INTERNAL_STATS +#if CONFIG_INTERNAL_STATS { - FILE *f = fopen("tmp.stt", "a"); + FILE *f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); int recon_err; vp9_clear_system_state(); // __asm emms; @@ -3307,7 +3296,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, &cm->yv12_fb[cm->new_fb_idx]); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" "%6d %6d %5d %5d %5d %8.2f %10d %10.3f" "%10.3f %8d %10d %10d %10d\n", @@ -3317,6 +3306,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (int)cpi->total_target_vs_actual, (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, + cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, vp9_convert_qindex_to_q(cpi->active_best_quality), @@ -3335,7 +3325,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct); else - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d" + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" "%5d %5d %5d %8d %8d %8.2f %10d %10.3f" "%8d %10d %10d %10d\n", @@ -3346,6 +3336,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (int)cpi->total_target_vs_actual, (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, + cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, vp9_convert_qindex_to_q(cpi->active_best_quality), @@ -3473,9 +3464,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas. 
- xd->seg.update_map = 0; - xd->seg.update_data = 0; - xd->lf.mode_ref_delta_update = 0; + cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; // keep track of the last coded dimensions cm->last_width = cm->width; @@ -3486,11 +3477,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (cm->show_frame) { // current mip will be the prev_mip for the next frame MODE_INFO *temp = cm->prev_mip; + MODE_INFO **temp2 = cm->prev_mi_grid_base; cm->prev_mip = cm->mip; cm->mip = temp; + cm->prev_mi_grid_base = cm->mi_grid_base; + cm->mi_grid_base = temp2; // update the upper left visible macroblock ptrs cm->mi = cm->mip + cm->mode_info_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; // Don't increment frame counters if this was an altref buffer // update not a real frame @@ -3499,8 +3494,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // restore prev_mi cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; -#if 0 + #if 0 { char filename[512]; FILE *recon_file; @@ -3521,6 +3517,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags) { + cpi->enable_encode_breakout = 1; + if (!cpi->refresh_alt_ref_frame) vp9_second_pass(cpi); @@ -3544,24 +3542,28 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, } } - -int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time) { - VP9_COMP *cpi = (VP9_COMP *) ptr; +static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) { VP9_COMMON *cm = &cpi->common; - struct vpx_usec_timer timer; - int res = 0; - if (!cpi->initial_width) { // TODO(jkoleszar): Support 1/4 subsampling? - cm->subsampling_x = sd->uv_width < sd->y_width; - cm->subsampling_y = sd->uv_height < sd->y_height; + cm->subsampling_x = (sd != NULL) && sd->uv_width < sd->y_width; + cm->subsampling_y = (sd != NULL) && sd->uv_height < sd->y_height; alloc_raw_frame_buffers(cpi); cpi->initial_width = cm->width; cpi->initial_height = cm->height; } +} + + +int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + VP9_COMP *cpi = (VP9_COMP *) ptr; + struct vpx_usec_timer timer; + int res = 0; + + check_initial_width(cpi, sd); vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? 
cpi->active_map : NULL)) @@ -3575,16 +3577,15 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, static int frame_is_reference(const VP9_COMP *cpi) { const VP9_COMMON *cm = &cpi->common; - const MACROBLOCKD *mb = &cpi->mb.e_mbd; return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || cm->refresh_frame_context || - mb->lf.mode_ref_delta_update || - mb->seg.update_map || - mb->seg.update_data; + cm->lf.mode_ref_delta_update || + cm->seg.update_map || + cm->seg.update_data; } #if CONFIG_MULTIPLE_ARF @@ -3644,6 +3645,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf, cpi->gfu_boost); vp9_temporal_filter_prepare(cpi, frames_to_arf); + vp9_extend_frame_borders(&cpi->alt_ref_buffer, + cm->subsampling_x, cm->subsampling_y); force_src_buffer = &cpi->alt_ref_buffer; } @@ -3911,9 +3914,9 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, { double frame_psnr2, frame_ssim2 = 0; double weight = 0; -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, - cpi->mb.e_mbd.lf.filter_level * 10 / 6); + cm->lf.filter_level * 10 / 6); #endif vp9_clear_system_state(); @@ -3987,8 +3990,8 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, return -1; else { int ret; -#if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, &cpi->mb.e_mbd.lf, dest, flags); +#if CONFIG_VP9_POSTPROC + ret = vp9_post_proc_frame(&cpi->common, dest, flags); #else if (cpi->common.frame_to_show) { @@ -4001,7 +4004,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, ret = -1; } -#endif // !CONFIG_POSTPROC +#endif // !CONFIG_VP9_POSTPROC vp9_clear_system_state(); return ret; } @@ -4013,7 +4016,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int threshold[MAX_SEGMENTS]) { VP9_COMP *cpi = (VP9_COMP *) comp; signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; - struct segmentation *seg = &cpi->mb.e_mbd.seg; + struct segmentation *seg = &cpi->common.seg; int i; if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) @@ -4030,7 +4033,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, // Activate segmentation. 
vp9_enable_segmentation((VP9_PTR)cpi); - // Set up the quan, LF and breakout threshold segment data + // Set up the quant, LF and breakout threshold segment data for (i = 0; i < MAX_SEGMENTS; i++) { feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; @@ -4050,7 +4053,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); } - // Initialise the feature data structure + // Initialize the feature data structure // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); @@ -4098,7 +4101,76 @@ int vp9_set_internal_size(VP9_PTR comp, return 0; } +int vp9_set_size_literal(VP9_PTR comp, unsigned int width, + unsigned int height) { + VP9_COMP *cpi = (VP9_COMP *)comp; + VP9_COMMON *cm = &cpi->common; + + check_initial_width(cpi, NULL); + + if (width) { + cm->width = width; + if (cm->width * 5 < cpi->initial_width) { + cm->width = cpi->initial_width / 5 + 1; + printf("Warning: Desired width too small, changed to %d \n", cm->width); + } + if (cm->width > cpi->initial_width) { + cm->width = cpi->initial_width; + printf("Warning: Desired width too large, changed to %d \n", cm->width); + } + } + + if (height) { + cm->height = height; + if (cm->height * 5 < cpi->initial_height) { + cm->height = cpi->initial_height / 5 + 1; + printf("Warning: Desired height too small, changed to %d \n", cm->height); + } + if (cm->height > cpi->initial_height) { + cm->height = cpi->initial_height; + printf("Warning: Desired height too large, changed to %d \n", cm->height); + } + } + + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + update_frame_size(cpi); + return 0; +} + +int vp9_switch_layer(VP9_PTR comp, int layer) { + VP9_COMP *cpi = (VP9_COMP *)comp; + + if (cpi->use_svc) { + cpi->current_layer = layer; + + // Use buffer i for layer i LST + cpi->lst_fb_idx = layer; + // Use buffer i-1 for layer i Alt (Inter-layer prediction) + if (layer != 0) cpi->alt_fb_idx = layer - 1; + + // Use the rest for Golden + if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES) + cpi->gld_fb_idx = cpi->lst_fb_idx; + else + cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer; + + printf("Switching to layer %d:\n", layer); + printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->alt_fb_idx); + } else { + printf("Switching layer not supported. Enable SVC first \n"); + } + return 0; +} + +void vp9_set_svc(VP9_PTR comp, int use_svc) { + VP9_COMP *cpi = (VP9_COMP *)comp; + cpi->use_svc = use_svc; + if (cpi->use_svc) printf("Enabled SVC encoder \n"); + return; +} int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { int i, j; diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index c258829..3e5796f 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -36,6 +36,8 @@ #define DISABLE_RC_LONG_TERM_MEM 0 #endif +// #define MODE_TEST_HIT_STATS + // #define SPEEDSTATS 1 #if CONFIG_MULTIPLE_ARF // Set MIN_GF_INTERVAL to 1 for the full decomposition. 
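For reference, the new vp9_set_size_literal added to vp9_onyx.c above bounds each requested dimension to (initial/5, initial]: the encoder refuses to scale below one fifth of the initial size or above the initial size itself. A minimal standalone sketch of just that rule, using nothing beyond the hunk (clamp_scaled_dim is an illustrative name, not a libvpx symbol):

#include <stdio.h>

/* Mirrors the per-dimension bound vp9_set_size_literal applies. */
static unsigned int clamp_scaled_dim(unsigned int requested,
                                     unsigned int initial) {
  unsigned int dim = requested;
  if (dim * 5 < initial)  /* too small: raise to the smallest legal value */
    dim = initial / 5 + 1;
  if (dim > initial)      /* too large: never exceed the initial size */
    dim = initial;
  return dim;
}

int main(void) {
  /* e.g. a 1280-wide source: out-of-range requests are pulled back in */
  printf("%u\n", clamp_scaled_dim(100, 1280));   /* prints 257 */
  printf("%u\n", clamp_scaled_dim(1920, 1280));  /* prints 1280 */
  return 0;
}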
@@ -79,15 +81,15 @@ typedef struct { vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob y_mode_prob[4][VP9_INTRA_MODES - 1]; - vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; + vp9_prob y_mode_prob[4][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; vp9_prob partition_prob[2][NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS - 1]; - int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; + int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; struct tx_probs tx_probs; vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; @@ -145,18 +147,19 @@ typedef struct { // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { THR_NEARESTMV, - THR_DC, - THR_NEARESTA, THR_NEARESTG, - THR_NEWMV, - THR_COMP_NEARESTLA, - THR_NEARMV, - THR_COMP_NEARESTGA, - THR_NEWG, + THR_DC, + + THR_NEWMV, THR_NEWA, + THR_NEWG, + + THR_NEARMV, THR_NEARA, + THR_COMP_NEARESTLA, + THR_COMP_NEARESTGA, THR_TM, @@ -182,7 +185,7 @@ typedef enum { THR_H_PRED, THR_V_PRED, THR_D135_PRED, - THR_D27_PRED, + THR_D207_PRED, THR_D153_PRED, THR_D63_PRED, THR_D117_PRED, @@ -192,7 +195,9 @@ typedef enum { typedef enum { DIAMOND = 0, NSTEP = 1, - HEX = 2 + HEX = 2, + BIGDIA = 3, + SQUARE = 4 } SEARCH_METHODS; typedef enum { @@ -230,20 +235,29 @@ typedef enum { FLAG_SKIP_INTRA_LOWVAR = 32, } MODE_SEARCH_SKIP_LOGIC; +typedef enum { + SUBPEL_ITERATIVE = 0, + SUBPEL_TREE = 1, + // Other methods to come +} SUBPEL_SEARCH_METHODS; + +#define ALL_INTRA_MODES 0x3FF +#define INTRA_DC_ONLY 0x01 +#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED)) +#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)) + typedef struct { int RD; SEARCH_METHODS search_method; int auto_filter; int recode_loop; - int iterative_sub_pixel; - int half_pixel_search; - int quarter_pixel_search; + SUBPEL_SEARCH_METHODS subpel_search_method; + int subpel_iters_per_step; int thresh_mult[MAX_MODES]; int max_step_search_steps; int reduce_first_step_size; int auto_mv_step_size; int optimize_coefficients; - int search_best_filter; int static_segmentation; int comp_inter_joint_search_thresh; int adaptive_rd_thresh; @@ -251,36 +265,43 @@ typedef struct { int skip_encode_frame; int use_lastframe_partitioning; TX_SIZE_SEARCH_METHOD tx_size_search_method; - int use_8tap_always; + int use_lp32x32fdct; int use_avoid_tested_higherror; int skip_lots_of_modes; - int adjust_thresholds_by_speed; int partition_by_variance; int use_one_partition_size_always; int less_rectangular_check; int use_square_partition_only; - int unused_mode_skip_lvl; + int mode_skip_start; int reference_masking; - BLOCK_SIZE_TYPE always_this_block_size; + BLOCK_SIZE always_this_block_size; int auto_min_max_partition_size; int auto_min_max_partition_interval; int auto_min_max_partition_count; - BLOCK_SIZE_TYPE min_partition_size; - BLOCK_SIZE_TYPE max_partition_size; - // int use_min_partition_size; // not used in code - // int use_max_partition_size; + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; int adjust_partitioning_from_last_frame; int last_partitioning_redo_frequency; int disable_splitmv; int using_small_partition_info; + // TODO(jingning): combine the related motion search speed 
features + int adaptive_motion_search; // Implements various heuristics to skip searching modes // The heuristics selected are based on flags // defined in the MODE_SEARCH_SKIP_HEURISTICS enum unsigned int mode_search_skip_flags; - MB_PREDICTION_MODE last_chroma_intra_mode; + // A source variance threshold below which the split mode is disabled + unsigned int disable_split_var_thresh; + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + int intra_y_mode_mask; + int intra_uv_mode_mask; int use_rd_breakout; int use_uv_intra_rd_estimate; + int use_fast_lpf_pick; + int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced } SPEED_FEATURES; typedef struct VP9_COMP { @@ -331,6 +352,10 @@ typedef struct VP9_COMP { int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; + + int current_layer; + int use_svc; + #if CONFIG_MULTIPLE_ARF int alt_ref_fb_idx[NUM_REF_FRAMES - 3]; #endif @@ -360,14 +385,12 @@ typedef struct VP9_COMP { unsigned int mode_check_freq[MAX_MODES]; unsigned int mode_test_hit_counts[MAX_MODES]; unsigned int mode_chosen_counts[MAX_MODES]; - int64_t unused_mode_skip_mask; + int64_t mode_skip_mask; int ref_frame_mask; int set_ref_frame_mask; - int rd_thresh_mult[MAX_MODES]; - int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES]; - int rd_threshes[BLOCK_SIZE_TYPES][MAX_MODES]; - int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES]; + int rd_threshes[BLOCK_SIZES][MAX_MODES]; + int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; // FIXME(rbultje) int64_t? @@ -381,9 +404,9 @@ typedef struct VP9_COMP { // FIXME(rbultje) can this overflow? int rd_tx_select_threshes[4][TX_MODES]; - int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1]; - int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1]; int RDMULT; int RDDIV; @@ -458,8 +481,8 @@ typedef struct VP9_COMP { int cq_target_quality; - int y_mode_count[4][VP9_INTRA_MODES]; - int y_uv_mode_count[VP9_INTRA_MODES][VP9_INTRA_MODES]; + int y_mode_count[4][INTRA_MODES]; + int y_uv_mode_count[INTRA_MODES][INTRA_MODES]; unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; nmv_context_counts NMVcount; @@ -472,6 +495,7 @@ typedef struct VP9_COMP { int last_boost; int kf_boost; int kf_zeromotion_pct; + int gf_zeromotion_pct; int64_t target_bandwidth; struct vpx_codec_pkt_list *output_pkt_list; @@ -527,10 +551,11 @@ typedef struct VP9_COMP { unsigned int active_map_enabled; fractional_mv_step_fp *find_fractional_mv_step; + fractional_mv_step_comp_fp *find_fractional_mv_step_comp; vp9_full_search_fn_t full_search_sad; vp9_refining_search_fn_t refining_search_sad; vp9_diamond_search_fn_t diamond_search_sad; - vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZE_TYPES]; + vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; @@ -623,14 +648,18 @@ typedef struct VP9_COMP { int dummy_packing; /* flag to indicate if packing is dummy */ - unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; + unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1] + [SWITCHABLE_FILTERS]; unsigned int txfm_stepdown_count[TX_SIZES]; int initial_width; int initial_height; + int 
number_spatial_layers; + int enable_encode_breakout; // Default value is 1. From first pass stats, + // encode_breakout may be disabled. + #if CONFIG_MULTIPLE_ARF // ARF tracking variables. int multi_arf_enabled; @@ -645,7 +674,13 @@ typedef struct VP9_COMP { #endif #ifdef ENTROPY_STATS - int64_t mv_ref_stats[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; +#endif + + +#ifdef MODE_TEST_HIT_STATS + // Debug / test stats + int64_t mode_test_hits[BLOCK_SIZES]; #endif } VP9_COMP; @@ -659,6 +694,17 @@ static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { } } +static int get_scale_ref_frame_idx(VP9_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { + if (ref_frame == LAST_FRAME) { + return 0; + } else if (ref_frame == GOLDEN_FRAME) { + return 1; + } else { + return 2; + } +} + void vp9_encode_frame(VP9_COMP *cpi); void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c index 2b8f2cd..239fd6b 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -21,29 +21,15 @@ #include "./vpx_scale_rtcd.h" void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { - uint8_t *src_y, *dst_y; - int yheight; - int ystride; - int yoffset; - int linestocopy; + YV12_BUFFER_CONFIG *dst_ybc, int fraction) { + const int height = src_ybc->y_height; + const int stride = src_ybc->y_stride; + const int offset = stride * ((height >> 5) * 16 - 8); + const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4; assert(src_ybc->y_stride == dst_ybc->y_stride); - yheight = src_ybc->y_height; - ystride = src_ybc->y_stride; - - linestocopy = (yheight >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - yoffset = ystride * ((yheight >> 5) * 16 - 8); - src_y = src_ybc->y_buffer + yoffset; - dst_y = dst_ybc->y_buffer + yoffset; - - vpx_memcpy(dst_y, src_y, ystride * (linestocopy + 16)); + vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset, + stride * (lines_to_copy + 16)); } static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, @@ -125,14 +111,14 @@ static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) { void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) { } -void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct loopfilter *lf = &cpi->mb.e_mbd.lf; +void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { + VP9_COMMON *const cm = &cpi->common; + struct loopfilter *const lf = &cm->lf; int best_err = 0; int filt_err = 0; - int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); - int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); + const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); int filter_step; int filt_high = 0; @@ -145,33 +131,26 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int Bias = 0; // Bias against raising loop filter and in favour of lowering it // Make a copy of the unfiltered / processed recon buffer - vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); + vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); - if (cm->frame_type == KEY_FRAME) - lf->sharpness_level = 0; - else - lf->sharpness_level = cpi->oxcf.Sharpness; + lf->sharpness_level = cm->frame_type == KEY_FRAME 
? 0 + : cpi->oxcf.Sharpness; // Start the search at the previous frame filter level unless it is now out of range. - filt_mid = lf->filter_level; - - if (filt_mid < min_filter_level) - filt_mid = min_filter_level; - else if (filt_mid > max_filter_level) - filt_mid = max_filter_level; + filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); // Define the initial step size - filter_step = (filt_mid < 16) ? 4 : filt_mid / 4; + filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Get baseline error score vp9_set_alt_lf_level(cpi, filt_mid); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial); best_err = vp9_calc_ss_err(sd, cm->frame_to_show); filt_best = filt_mid; // Re-instate the unfiltered frame - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); while (filter_step > 0) { Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images @@ -190,12 +169,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { if ((filt_direction <= 0) && (filt_low != filt_mid)) { // Get Low filter error score vp9_set_alt_lf_level(cpi, filt_low); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // If value is close to the best so far then bias towards a lower loop filter value. if ((filt_err - Bias) < best_err) { @@ -210,12 +189,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { // Now look at filt_high if ((filt_direction >= 0) && (filt_high != filt_mid)) { vp9_set_alt_lf_level(cpi, filt_high); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1); + vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // Was it better than the previous best? 
if (filt_err < (best_err - Bias)) { @@ -236,3 +215,4 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { lf->filter_level = filt_best; } + diff --git a/libvpx/vp9/encoder/vp9_picklpf.h b/libvpx/vp9/encoder/vp9_picklpf.h index 698cb8d..9de4cf8 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.h +++ b/libvpx/vp9/encoder/vp9_picklpf.h @@ -18,6 +18,5 @@ struct VP9_COMP; void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val); void vp9_pick_filter_level(struct yv12_buffer_config *sd, - struct VP9_COMP *cpi); - + struct VP9_COMP *cpi, int partial); #endif // VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 525f4da..6c8b2a0 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -69,6 +69,7 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, if (x >= zbin) { x += (round_ptr[rc != 0]); + x = clamp(x, INT16_MIN, INT16_MAX); y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * quant_shift_ptr[rc != 0]) >> 16; // quantize (x) x = (y ^ sz) - sz; // get the sign back @@ -84,7 +85,6 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, *eob_ptr = eob + 1; } -// This function works well for large transform size. void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, @@ -94,7 +94,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { int i, rc, eob; - int zbins[2], nzbins[2], zbin; + int zbins[2], nzbins[2]; int x, y, z, sz; int idx = 0; int idx_arr[1024]; @@ -105,8 +105,8 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, eob = -1; // Base ZBIN - zbins[0] = zbin_ptr[0] + zbin_oq_value; - zbins[1] = zbin_ptr[1] + zbin_oq_value; + zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); + zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); nzbins[0] = zbins[0] * -1; nzbins[1] = zbins[1] * -1; @@ -114,7 +114,7 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, // Pre-scan pass for (i = 0; i < n_coeffs; i++) { rc = scan[i]; - z = coeff_ptr[rc] * 2; + z = coeff_ptr[rc]; // If the coefficient is out of the base ZBIN range, keep it for // quantization. 
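The vp9_quantize_b_32x32_c pre-scan above swaps "double the coefficient" for "halve the zero bin": it now compares coeff_ptr[rc] directly against ROUND_POWER_OF_TWO(zbin, 1) instead of 2 * coeff_ptr[rc] against zbin, and the hunk just below compensates in the quantize step by shifting right by 15 instead of 16. For integer inputs the two zero-bin decisions agree exactly, as this sketch checks (it assumes the stock libvpx macro ROUND_POWER_OF_TWO(v, n) = ((v) + (1 << (n - 1))) >> (n)):

#include <assert.h>
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  int zbin, coeff;
  /* Old pre-scan keeps a coefficient when 2*coeff >= zbin; the new one
   * keeps it when coeff >= ROUND_POWER_OF_TWO(zbin, 1). Since
   * ROUND_POWER_OF_TWO(z, 1) is ceil(z/2), both keep the same set. */
  for (zbin = 0; zbin < 1024; ++zbin)
    for (coeff = 0; coeff < 1024; ++coeff)
      assert((2 * coeff >= zbin) == (coeff >= ROUND_POWER_OF_TWO(zbin, 1)));
  printf("zero-bin decisions agree\n");
  return 0;
}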
@@ -127,31 +127,52 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, for (i = 0; i < idx; i++) { rc = scan[idx_arr[i]]; - // Calculate ZBIN - zbin = (zbins[rc != 0]); - - z = coeff_ptr[rc] * 2; + z = coeff_ptr[rc]; sz = (z >> 31); // sign of z x = (z ^ sz) - sz; // x = abs(z) - if (x >= zbin) { - x += (round_ptr[rc != 0]); - y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * - quant_shift_ptr[rc != 0]) >> 16; // quantize (x) + x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + x = clamp(x, INT16_MIN, INT16_MAX); + y = ((((x * quant_ptr[rc != 0]) >> 16) + x) * + quant_shift_ptr[rc != 0]) >> 15; // quantize (x) - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value - if (y) { - eob = idx_arr[i]; // last nonzero coeffs - } - } + if (y) + eob = idx_arr[i]; // last nonzero coeffs } } *eob_ptr = eob + 1; } +struct plane_block_idx { + int plane; + int block; +}; + +// TODO(jkoleszar): returning a struct so it can be used in a const context, +// expect to refactor this further later. +static INLINE struct plane_block_idx plane_block_idx(int y_blocks, + int b_idx) { + const int v_offset = y_blocks * 5 / 4; + struct plane_block_idx res; + + if (b_idx < y_blocks) { + res.plane = 0; + res.block = b_idx; + } else if (b_idx < v_offset) { + res.plane = 1; + res.block = b_idx - y_blocks; + } else { + assert(b_idx < y_blocks * 3 / 2); + res.plane = 2; + res.block = b_idx - v_offset; + } + return res; +} + void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks) { MACROBLOCKD *const xd = &mb->e_mbd; @@ -159,14 +180,14 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, const int16_t *scan = get_scan_4x4(tx_type); const int16_t *iscan = get_iscan_4x4(tx_type); - vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16), + vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block), 16, mb->skip_block, mb->plane[pb_idx.plane].zbin, mb->plane[pb_idx.plane].round, mb->plane[pb_idx.plane].quant, mb->plane[pb_idx.plane].quant_shift, - BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16), - BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16), + BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block), + BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block), xd->plane[pb_idx.plane].dequant, mb->plane[pb_idx.plane].zbin_extra, &xd->plane[pb_idx.plane].eobs[pb_idx.block], @@ -185,63 +206,43 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { } void vp9_init_quantizer(VP9_COMP *cpi) { - int i; - int quant_val; - int quant_uv_val; -#if CONFIG_ALPHA - int quant_alpha_val; -#endif - int q; + int i, q; + VP9_COMMON *const cm = &cpi->common; for (q = 0; q < QINDEX_RANGE; q++) { - int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80; - int qrounding_factor = 48; - if (q == 0) { - qzbin_factor = 64; - qrounding_factor = 64; + const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80); + const int qrounding_factor = q == 0 ? 64 : 48; + + // y + for (i = 0; i < 2; ++i) { + const int quant = i == 0 ? 
vp9_dc_quant(q, cm->y_dc_delta_q) + : vp9_ac_quant(q, 0); + invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant); + cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + cpi->y_round[q][i] = (qrounding_factor * quant) >> 7; + cm->y_dequant[q][i] = quant; } - // dc values - quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q); - invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val); - cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.y_dequant[q][0] = quant_val; - - quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q); - invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val); - cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.uv_dequant[q][0] = quant_val; - -#if CONFIG_ALPHA - quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q); - invert_quant(cpi->a_quant[q] + 0, cpi->a_quant_shift[q] + 0, quant_val); - cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7; - cpi->common.a_dequant[q][0] = quant_val; -#endif - - quant_val = vp9_ac_quant(q, 0); - invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val); - cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7; - cpi->common.y_dequant[q][1] = quant_val; - - quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q); - invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1, - quant_uv_val); - cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7); - cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7; - cpi->common.uv_dequant[q][1] = quant_uv_val; + // uv + for (i = 0; i < 2; ++i) { + const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q) + : vp9_ac_quant(q, cm->uv_ac_delta_q); + invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant); + cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7; + cm->uv_dequant[q][i] = quant; + } #if CONFIG_ALPHA - quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q); - invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1, - quant_alpha_val); - cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); - cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7; - cpi->common.a_dequant[q][1] = quant_alpha_val; + // alpha + for (i = 0; i < 2; ++i) { + const int quant = i == 0 ? 
vp9_dc_quant(q, cm->a_dc_delta_q) + : vp9_ac_quant(q, cm->a_ac_delta_q); + invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant); + cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + cpi->a_round[q][i] = (qrounding_factor * quant) >> 7; + cm->a_dequant[q][i] = quant; + } #endif for (i = 2; i < 8; i++) { @@ -249,20 +250,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1]; cpi->y_zbin[q][i] = cpi->y_zbin[q][1]; cpi->y_round[q][i] = cpi->y_round[q][1]; - cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1]; + cm->y_dequant[q][i] = cm->y_dequant[q][1]; cpi->uv_quant[q][i] = cpi->uv_quant[q][1]; cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1]; cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1]; cpi->uv_round[q][i] = cpi->uv_round[q][1]; - cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1]; + cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; #if CONFIG_ALPHA cpi->a_quant[q][i] = cpi->a_quant[q][1]; cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1]; cpi->a_zbin[q][i] = cpi->a_zbin[q][1]; cpi->a_round[q][i] = cpi->a_round[q][1]; - cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1]; + cm->a_dequant[q][i] = cm->a_dequant[q][1]; #endif } } @@ -272,8 +273,9 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { int i; MACROBLOCKD *xd = &x->e_mbd; int zbin_extra; - int segment_id = xd->mode_info_context->mbmi.segment_id; - const int qindex = vp9_get_qindex(xd, segment_id, cpi->common.base_qindex); + int segment_id = xd->this_mi->mbmi.segment_id; + const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id, + cpi->common.base_qindex); // Y zbin_extra = (cpi->common.y_dequant[qindex][1] * @@ -308,7 +310,8 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; #endif - x->skip_block = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); + x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id, + SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ x->e_mbd.q_index = qindex; diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index d3a9529..2d12ba9 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -71,7 +71,6 @@ int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, void vp9_save_coding_context(VP9_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; // Stores a snapshot of key state variables which can subsequently be // restored with a call to vp9_restore_coding_context. 
These functions are @@ -89,7 +88,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, xd->seg.pred_probs); + vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); @@ -99,8 +98,8 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); - vp9_copy(cc->last_ref_lf_deltas, xd->lf.last_ref_deltas); - vp9_copy(cc->last_mode_lf_deltas, xd->lf.last_mode_deltas); + vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); + vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); vp9_copy(cc->coef_probs, cm->fc.coef_probs); vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); @@ -111,7 +110,6 @@ void vp9_save_coding_context(VP9_COMP *cpi) { void vp9_restore_coding_context(VP9_COMP *cpi) { CODING_CONTEXT *const cc = &cpi->coding_context; VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; // Restore key state variables to the snapshot state stored in the // previous call to vp9_save_coding_context. @@ -127,7 +125,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(xd->seg.pred_probs, cc->segment_pred_probs); + vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); @@ -138,8 +136,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); - vp9_copy(xd->lf.last_ref_deltas, cc->last_ref_lf_deltas); - vp9_copy(xd->lf.last_mode_deltas, cc->last_mode_lf_deltas); + vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); + vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); vp9_copy(cm->fc.coef_probs, cc->coef_probs); vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); @@ -149,9 +147,8 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { void vp9_setup_key_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; - vp9_setup_past_independence(cm, xd); + vp9_setup_past_independence(cm); // interval before next GF cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; @@ -162,9 +159,8 @@ void vp9_setup_key_frame(VP9_COMP *cpi) { void vp9_setup_inter_frame(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &cpi->mb.e_mbd; if (cm->error_resilient_mode || cm->intra_only) - vp9_setup_past_independence(cm, xd); + vp9_setup_past_independence(cm); assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS); cm->fc = cm->frame_contexts[cm->frame_context_idx]; diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 2d93250..df00334 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include <stdio.h> #include <math.h> #include <limits.h> @@ -49,65 +48,66 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -#define I4X4_PRED 0x8000 -#define SPLITMV 0x10000 +#define LAST_FRAME_MODE_MASK 0xFFDADCD60 +#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0 +#define ALT_REF_MODE_MASK 0xFF8C648D0 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - {NEARESTMV, LAST_FRAME, NONE}, - {DC_PRED, INTRA_FRAME, NONE}, - - {NEARESTMV, ALTREF_FRAME, NONE}, - {NEARESTMV, GOLDEN_FRAME, NONE}, - {NEWMV, LAST_FRAME, NONE}, - {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, - {NEARMV, LAST_FRAME, NONE}, - {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {NEWMV, GOLDEN_FRAME, NONE}, - {NEWMV, ALTREF_FRAME, NONE}, - {NEARMV, ALTREF_FRAME, NONE}, - - {TM_PRED, INTRA_FRAME, NONE}, - - {NEARMV, LAST_FRAME, ALTREF_FRAME}, - {NEWMV, LAST_FRAME, ALTREF_FRAME}, - {NEARMV, GOLDEN_FRAME, NONE}, - {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, - {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {SPLITMV, LAST_FRAME, NONE}, - {SPLITMV, GOLDEN_FRAME, NONE}, - {SPLITMV, ALTREF_FRAME, NONE}, - {SPLITMV, LAST_FRAME, ALTREF_FRAME}, - {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {ZEROMV, LAST_FRAME, NONE}, - {ZEROMV, GOLDEN_FRAME, NONE}, - {ZEROMV, ALTREF_FRAME, NONE}, - {ZEROMV, LAST_FRAME, ALTREF_FRAME}, - {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {I4X4_PRED, INTRA_FRAME, NONE}, - {H_PRED, INTRA_FRAME, NONE}, - {V_PRED, INTRA_FRAME, NONE}, - {D135_PRED, INTRA_FRAME, NONE}, - {D27_PRED, INTRA_FRAME, NONE}, - {D153_PRED, INTRA_FRAME, NONE}, - {D63_PRED, INTRA_FRAME, NONE}, - {D117_PRED, INTRA_FRAME, NONE}, - {D45_PRED, INTRA_FRAME, NONE}, + {RD_NEARESTMV, LAST_FRAME, NONE}, + {RD_NEARESTMV, ALTREF_FRAME, NONE}, + {RD_NEARESTMV, GOLDEN_FRAME, NONE}, + + {RD_DC_PRED, INTRA_FRAME, NONE}, + + {RD_NEWMV, LAST_FRAME, NONE}, + {RD_NEWMV, ALTREF_FRAME, NONE}, + {RD_NEWMV, GOLDEN_FRAME, NONE}, + + {RD_NEARMV, LAST_FRAME, NONE}, + {RD_NEARMV, ALTREF_FRAME, NONE}, + {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME}, + {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_TM_PRED, INTRA_FRAME, NONE}, + + {RD_NEARMV, LAST_FRAME, ALTREF_FRAME}, + {RD_NEWMV, LAST_FRAME, ALTREF_FRAME}, + {RD_NEARMV, GOLDEN_FRAME, NONE}, + {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, + {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_SPLITMV, LAST_FRAME, NONE}, + {RD_SPLITMV, GOLDEN_FRAME, NONE}, + {RD_SPLITMV, ALTREF_FRAME, NONE}, + {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME}, + {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_ZEROMV, LAST_FRAME, NONE}, + {RD_ZEROMV, GOLDEN_FRAME, NONE}, + {RD_ZEROMV, ALTREF_FRAME, NONE}, + {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME}, + {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, + + {RD_I4X4_PRED, INTRA_FRAME, NONE}, + {RD_H_PRED, INTRA_FRAME, NONE}, + {RD_V_PRED, INTRA_FRAME, NONE}, + {RD_D135_PRED, INTRA_FRAME, NONE}, + {RD_D207_PRED, INTRA_FRAME, NONE}, + {RD_D153_PRED, INTRA_FRAME, NONE}, + {RD_D63_PRED, INTRA_FRAME, NONE}, + {RD_D117_PRED, INTRA_FRAME, NONE}, + {RD_D45_PRED, INTRA_FRAME, NONE}, }; // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for blocks size. // The factors here are << 2 (2 = x0.5, 32 = x8 etc). 
-static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] = +static int rd_thresh_block_size_factor[BLOCK_SIZES] = {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32}; -#define BASE_RD_THRESH_FREQ_FACT 16 -#define MAX_RD_THRESH_FREQ_FACT 32 -#define MAX_RD_THRESH_FREQ_INC 1 +#define MAX_RD_THRESH_FACT 64 +#define RD_THRESH_INC 1 static void fill_token_costs(vp9_coeff_cost *c, vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { @@ -160,6 +160,15 @@ static int compute_rd_mult(int qindex) { return (11 * q * q) >> 2; } +static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) { + if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) { + assert(!"Invalid rd_mode"); + return MB_MODE_COUNT; + } + assert((int)rd_mode < (int)MB_MODE_COUNT); + return (MB_PREDICTION_MODE)rd_mode; +} + void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; @@ -199,7 +208,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->RDDIV = 1; cpi->RDMULT /= 100; - for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) { + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { for (i = 0; i < MAX_MODES; ++i) { // Threshold here seem unecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[] @@ -213,18 +222,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } else { cpi->rd_threshes[bsize][i] = INT_MAX; } - cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - - if (cpi->sf.adaptive_rd_thresh) - cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; - else - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } else { cpi->RDDIV = 100; - for (bsize = 0; bsize < BLOCK_SIZE_TYPES; ++bsize) { + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { for (i = 0; i < MAX_MODES; i++) { // Threshold here seem unecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[] @@ -237,12 +240,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } else { cpi->rd_threshes[bsize][i] = INT_MAX; } - cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - - if (cpi->sf.adaptive_rd_thresh) - cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; - else - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } @@ -277,16 +274,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } } -static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) { - return bsize_from_dim_lookup[bwl][bhl]; -} - -static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize, - struct macroblockd_plane *pd) { - return get_block_size(plane_block_width_log2by4(bsize, pd), - plane_block_height_log2by4(bsize, pd)); -} - static INLINE void linear_interpolate2(double x, int ntab, int inv_step, const double *tab1, const double *tab2, double *v1, double *v2) { @@ -388,7 +375,7 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep, vp9_clear_system_state(); } -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, +static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum) { // Note our transform coeffs are 8 times an orthogonal transform. 
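The rd_thresh_block_size_factor table above stores its multipliers with two fractional bits (hence the "<< 2" comment: 2 = x0.5, 4 = x1.0 at the 8x8 baseline, 32 = x8), so applying an entry is a multiply followed by a right shift of 2. A small sketch of that scaling, assuming the usual 13-entry BLOCK_SIZES ordering where BLOCK_4X4 is index 0, BLOCK_8X8 index 3, and BLOCK_64X64 index 12 (scale_rd_thresh is an illustrative name):

#include <stdio.h>

/* Table from the hunk above: per-block-size correction for rd
 * thresholds defined at the 8x8 baseline, in Q2 fixed point. */
static const int rd_thresh_block_size_factor[13] =
    {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};

static int scale_rd_thresh(int thresh_8x8, int bsize) {
  /* multiply by the Q2 factor, then drop the two fractional bits */
  return (thresh_8x8 * rd_thresh_block_size_factor[bsize]) >> 2;
}

int main(void) {
  printf("%d\n", scale_rd_thresh(1000, 0));   /* BLOCK_4X4:   x0.5 -> 500  */
  printf("%d\n", scale_rd_thresh(1000, 3));   /* BLOCK_8X8:   x1.0 -> 1000 */
  printf("%d\n", scale_rd_thresh(1000, 12));  /* BLOCK_64X64: x8.0 -> 8000 */
  return 0;
}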
@@ -399,18 +386,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); unsigned int sse; int rate; int64_t dist; (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), + model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; @@ -421,81 +404,52 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, *out_dist_sum = dist_sum << 4; } -static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int64_t *out_dist_sum) { - // Note our transform coeffs are 8 times an orthogonal transform. - // Hence quantizer step is also 8 times. To get effective quantizer - // we need to divide by 8 before sending to modeling function. - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); - unsigned int sse; - int rate; - int64_t dist; - (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), - pd->dequant[1] >> 3, &rate, &dist); - - *out_rate_sum = rate; - *out_dist_sum = dist << 4; -} - -static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, +static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize, TX_SIZE tx_size, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, int *out_skip) { - int t = 4, j, k; - BLOCK_SIZE_TYPE bs = BLOCK_4X4; + int j, k; + BLOCK_SIZE bs; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - const int width = plane_block_width(bsize, pd); - const int height = plane_block_height(bsize, pd); + const int width = 4 << num_4x4_blocks_wide_lookup[bsize]; + const int height = 4 << num_4x4_blocks_high_lookup[bsize]; int rate_sum = 0; int64_t dist_sum = 0; + const int t = 4 << tx_size; if (tx_size == TX_4X4) { bs = BLOCK_4X4; - t = 4; } else if (tx_size == TX_8X8) { bs = BLOCK_8X8; - t = 8; } else if (tx_size == TX_16X16) { bs = BLOCK_16X16; - t = 16; } else if (tx_size == TX_32X32) { bs = BLOCK_32X32; - t = 32; } else { assert(0); } + *out_skip = 1; for (j = 0; j < height; j += t) { for (k = 0; k < width; k += t) { int rate; int64_t dist; unsigned int sse; - (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k, - p->src.stride, - pd->dst.buf + j * pd->dst.stride + k, - pd->dst.stride, &sse); + cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride, + &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride, + &sse); // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, 
- &rate, &dist); + model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; dist_sum += dist; *out_skip &= (rate < 1024); } } + *out_rate_sum = rate_sum; - *out_dist_sum = (dist_sum << 4); + *out_dist_sum = dist_sum << 4; } int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, @@ -526,42 +480,39 @@ static const int16_t band_counts[TX_SIZES][8] = { }; static INLINE int cost_coeffs(MACROBLOCK *mb, - int plane, int block, PLANE_TYPE type, + int plane, int block, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, const int16_t *scan, const int16_t *nb) { MACROBLOCKD *const xd = &mb->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - int pt, c, cost; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + struct macroblockd_plane *pd = &xd->plane[plane]; + const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; - const int eob = xd->plane[plane].eobs[block]; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); + const int eob = pd->eobs[block]; + const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); const int ref = mbmi->ref_frame[0] != INTRA_FRAME; - unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref]; - ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; + unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = + mb->token_costs[tx_size][type][ref]; + const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; uint8_t token_cache[1024]; + int pt = combine_entropy_contexts(above_ec, left_ec); + int c, cost; // Check for consistency of tx_size with mode info - assert((!type && !plane) || (type && plane)); - if (type == PLANE_TYPE_Y_WITH_DC) { - assert(xd->mode_info_context->mbmi.txfm_size == tx_size); - } else { - assert(tx_size == get_uv_tx_size(mbmi)); - } - - pt = combine_entropy_contexts(above_ec, left_ec); + assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size + : get_uv_tx_size(mbmi) == tx_size); if (eob == 0) { // single eob token cost = token_costs[0][0][pt][DCT_EOB_TOKEN]; c = 0; } else { - int v, prev_t, band_left = *band_count++; + int band_left = *band_count++; // dc token - v = qcoeff_ptr[0]; - prev_t = vp9_dct_value_tokens_ptr[v].token; + int v = qcoeff_ptr[0]; + int prev_t = vp9_dct_value_tokens_ptr[v].token; cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; token_cache[0] = vp9_pt_energy_class[prev_t]; ++token_costs; @@ -591,13 +542,12 @@ static INLINE int cost_coeffs(MACROBLOCK *mb, } // is eob first coefficient; - *A = *L = c > 0; + *A = *L = (c > 0); return cost; } struct rdcost_block_args { - VP9_COMMON *cm; MACROBLOCK *x; ENTROPY_CONTEXT t_above[16]; ENTROPY_CONTEXT t_left[16]; @@ -612,23 +562,23 @@ struct rdcost_block_args { const int16_t *scan, *nb; }; -static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { + const int ss_txfrm_size = tx_size << 1; struct rdcost_block_args* args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; int64_t this_sse; int shift = args->tx_size == TX_32X32 ? 
0 : 2; - int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16); - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; args->sse += this_sse >> shift; if (x->skip_encode && - xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { + xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // TODO(jingning): tune the model to better capture the distortion. int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> shift; @@ -637,119 +587,25 @@ static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, } } -static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct rdcost_block_args* args = arg; - int x_idx, y_idx; - MACROBLOCKD * const xd = &args->x->e_mbd; - txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, - &y_idx); + int x_idx, y_idx; + txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); args->rate += cost_coeffs(args->x, plane, block, - xd->plane[plane].plane_type, args->t_above + x_idx, + args->t_above + x_idx, args->t_left + y_idx, args->tx_size, args->scan, args->nb); } -// FIXME(jingning): need to make the rd test of chroma components consistent -// with that of luma component. this function should be deprecated afterwards. -static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD * const xd = &x->e_mbd; - const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]); - const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]); - const int bw = 1 << bwl, bh = 1 << bhl; - int i; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, - 0, 0, 0, INT64_MAX, 0 }; - - switch (tx_size) { - case TX_4X4: - vpx_memcpy(&args.t_above, xd->plane[plane].above_context, - sizeof(ENTROPY_CONTEXT) * bw); - vpx_memcpy(&args.t_left, xd->plane[plane].left_context, - sizeof(ENTROPY_CONTEXT) * bh); - args.scan = vp9_default_scan_4x4; - args.nb = vp9_default_scan_4x4_neighbors; - break; - case TX_8X8: - for (i = 0; i < bw; i += 2) - args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i]; - for (i = 0; i < bh; i += 2) - args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i]; - args.scan = vp9_default_scan_8x8; - args.nb = vp9_default_scan_8x8_neighbors; - break; - case TX_16X16: - for (i = 0; i < bw; i += 4) - args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i]; - for (i = 0; i < bh; i += 4) - args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i]; - args.scan = vp9_default_scan_16x16; - args.nb = vp9_default_scan_16x16_neighbors; - break; - case TX_32X32: - for (i = 0; i < bw; i += 8) - args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i]; - for (i = 0; i < bh; i += 8) - args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i]; - args.scan = vp9_default_scan_32x32; - args.nb = vp9_default_scan_32x32_neighbors; - break; - default: - assert(0); - } - - foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args); - return args.rate; -} - -static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - int cost = 0, plane; - - for (plane = 1; 
plane < MAX_MB_PLANE; plane++) { - cost += rdcost_plane(cm, x, plane, bsize, tx_size); - } - return cost; -} - -static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int shift, int64_t *sse) { - struct macroblockd_plane *p = &x->e_mbd.plane[0]; - const int bwl = plane_block_width_log2by4(bsize, p); - const int bhl = plane_block_height_log2by4(bsize, p); - int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, - 16 << (bwl + bhl), sse) >> shift; - *sse >>= shift; - return e; -} - -static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int shift, int64_t *sse) { - int64_t sum = 0, this_sse; - int plane; - - *sse = 0; - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - struct macroblockd_plane *p = &x->e_mbd.plane[plane]; - const int bwl = plane_block_width_log2by4(bsize, p); - const int bhl = plane_block_height_log2by4(bsize, p); - sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - 16 << (bwl + bhl), &this_sse); - *sse += this_sse; - } - *sse >>= shift; - return sum >> shift; -} - -static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct rdcost_block_args *args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - struct encode_b_args encode_args = {args->cm, x, NULL}; + struct encode_b_args encode_args = {x, NULL}; int64_t rd1, rd2, rd; if (args->skip) @@ -765,58 +621,61 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, return; } - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args); + if (!is_inter_block(&xd->this_mi->mbmi)) + vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); else - xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args); + vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args); - dist_block(plane, block, bsize, ss_txfrm_size, args); - rate_block(plane, block, bsize, ss_txfrm_size, args); + dist_block(plane, block, tx_size, args); + rate_block(plane, block, plane_bsize, tx_size, args); } -static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { +static void txfm_rd_in_plane(MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + int64_t ref_best_rd, int plane, + BLOCK_SIZE bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const int bw = 1 << bwl, bh = 1 << bhl; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs]; int i; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size, + num_4x4_blocks_wide, num_4x4_blocks_high, 0, 0, 0, ref_best_rd, 0 }; - xd->mode_info_context->mbmi.txfm_size = tx_size; + if (plane == 0) + xd->this_mi->mbmi.tx_size = tx_size; + switch (tx_size) { case TX_4X4: vpx_memcpy(&args.t_above, pd->above_context, - sizeof(ENTROPY_CONTEXT) * 
bw); + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide); vpx_memcpy(&args.t_left, pd->left_context, - sizeof(ENTROPY_CONTEXT) * bh); - get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0), + sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high); + get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0), &args.scan, &args.nb); break; case TX_8X8: - for (i = 0; i < bw; i += 2) + for (i = 0; i < num_4x4_blocks_wide; i += 2) args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 2) + for (i = 0; i < num_4x4_blocks_high; i += 2) args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; - get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd), + get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_16X16: - for (i = 0; i < bw; i += 4) + for (i = 0; i < num_4x4_blocks_wide; i += 4) args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 4) + for (i = 0; i < num_4x4_blocks_high; i += 4) args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; - get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd), + get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd), &args.scan, &args.nb); break; case TX_32X32: - for (i = 0; i < bw; i += 8) + for (i = 0; i < num_4x4_blocks_wide; i += 8) args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; - for (i = 0; i < bh; i += 8) + for (i = 0; i < num_4x4_blocks_high; i += 8) args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; args.scan = vp9_default_scan_32x32; args.nb = vp9_default_scan_32x32_neighbors; @@ -825,40 +684,39 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, assert(0); } - foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args); + foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args); *distortion = args.dist; *rate = args.rate; *sse = args.sse; - *skippable = vp9_sby_is_skippable(xd, bsize) && (!args.skip); + *skippable = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip); } static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *sse, int64_t ref_best_rd, - BLOCK_SIZE_TYPE bs) { - const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); + BLOCK_SIZE bs) { + const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; if (max_txfm_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { - mbmi->txfm_size = TX_32X32; + mbmi->tx_size = TX_32X32; } else if (max_txfm_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || cm->tx_mode == TX_MODE_SELECT)) { - mbmi->txfm_size = TX_16X16; + mbmi->tx_size = TX_16X16; } else if (cm->tx_mode != ONLY_4X4) { - mbmi->txfm_size = TX_8X8; + mbmi->tx_size = TX_8X8; } else { - mbmi->txfm_size = TX_4X4; + mbmi->tx_size = TX_4X4; } - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, - &sse[mbmi->txfm_size], ref_best_rd, bs, - mbmi->txfm_size); + txfm_rd_in_plane(x, rate, distortion, skip, + &sse[mbmi->tx_size], ref_best_rd, 0, bs, + mbmi->tx_size); cpi->txfm_stepdown_count[0]++; } @@ -867,18 +725,17 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int64_t *d, int64_t *distortion, int *s, int *skip, int64_t tx_cache[TX_MODES], - BLOCK_SIZE_TYPE bs) { - const TX_SIZE max_tx_size = TX_32X32 - - (bs < BLOCK_32X32) - (bs < 
BLOCK_16X16); + BLOCK_SIZE bs) { + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZES][2]; int n, m; int s0, s1; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi); for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; @@ -914,26 +771,26 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, (cm->tx_mode == TX_MODE_SELECT && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_32X32; + mbmi->tx_size = TX_32X32; } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_16X16; + mbmi->tx_size = TX_16X16; } else if (cm->tx_mode == ALLOW_8X8 || cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { - mbmi->txfm_size = TX_8X8; + mbmi->tx_size = TX_8X8; } else { - mbmi->txfm_size = TX_4X4; + mbmi->tx_size = TX_4X4; } - *distortion = d[mbmi->txfm_size]; - *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; - *skip = s[mbmi->txfm_size]; + *distortion = d[mbmi->tx_size]; + *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; + *skip = s[mbmi->tx_size]; tx_cache[ONLY_4X4] = rd[TX_4X4][0]; tx_cache[ALLOW_8X8] = rd[TX_8X8][0]; @@ -971,13 +828,11 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, int64_t *d, int64_t *distortion, int *s, int *skip, int64_t *sse, int64_t ref_best_rd, - BLOCK_SIZE_TYPE bs, - int *model_used) { - const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); + BLOCK_SIZE bs) { + const TX_SIZE max_txfm_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZES][2]; int n, m; @@ -985,7 +840,7 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; - const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi); // for (n = TX_4X4; n <= max_txfm_size; n++) // r[n][0] = (r[n][0] * scale_r[n]); @@ -1023,35 +878,28 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_32X32][1] <= rd[TX_16X16][1] && rd[TX_32X32][1] <= rd[TX_8X8][1] && rd[TX_32X32][1] <= rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_32X32; + mbmi->tx_size = TX_32X32; } else if (max_txfm_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_16X16][1] <= rd[TX_8X8][1] && rd[TX_16X16][1] <= rd[TX_4X4][1]))) { - mbmi->txfm_size = TX_16X16; + mbmi->tx_size = TX_16X16; } else if (cm->tx_mode == ALLOW_8X8 || cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] <= rd[TX_4X4][1])) { - mbmi->txfm_size = TX_8X8; + 
mbmi->tx_size = TX_8X8; } else { - mbmi->txfm_size = TX_4X4; + mbmi->tx_size = TX_4X4; } - if (model_used[mbmi->txfm_size]) { - // Actually encode using the chosen mode if a model was used, but do not - // update the r, d costs - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, - &sse[mbmi->txfm_size], ref_best_rd, - bs, mbmi->txfm_size); - } else { - *distortion = d[mbmi->txfm_size]; - *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; - *skip = s[mbmi->txfm_size]; - } + // Actually encode using the chosen mode if a model was used, but do not + // update the r, d costs + txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size], + ref_best_rd, 0, bs, mbmi->tx_size); if (max_txfm_size == TX_32X32 && rd[TX_32X32][1] <= rd[TX_16X16][1] && @@ -1071,14 +919,13 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, - int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs, + int *skip, int64_t *psse, BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - VP9_COMMON *const cm = &cpi->common; int r[TX_SIZES][2], s[TX_SIZES]; int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; assert(bs == mbmi->sb_type); if (mbmi->ref_frame[0] > INTRA_FRAME) @@ -1091,65 +938,43 @@ static void super_block_yrd(VP9_COMP *cpi, choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); if (psse) - *psse = sse[mbmi->txfm_size]; + *psse = sse[mbmi->tx_size]; return; } if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && mbmi->ref_frame[0] > INTRA_FRAME) { - int model_used[TX_SIZES] = {1, 1, 1, 1}; - if (bs >= BLOCK_32X32) { - if (model_used[TX_32X32]) - model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, - &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], - &s[TX_32X32], &sse[TX_32X32], INT64_MAX, - bs, TX_32X32); - } - if (bs >= BLOCK_16X16) { - if (model_used[TX_16X16]) - model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, - &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], - &s[TX_16X16], &sse[TX_16X16], INT64_MAX, - bs, TX_16X16); - } - if (model_used[TX_8X8]) - model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, - &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], - &sse[TX_8X8], INT64_MAX, bs, TX_8X8); + if (bs >= BLOCK_32X32) + model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, + &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + if (bs >= BLOCK_16X16) + model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, + &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); - if (model_used[TX_4X4]) - model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, - &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); - else - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], - &sse[TX_4X4], INT64_MAX, bs, TX_4X4); + model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, + &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); + + model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, + &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, - skip, sse, ref_best_rd, bs, model_used); + skip, sse, ref_best_rd, bs); } else { if (bs >= BLOCK_32X32) - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], - &s[TX_32X32], &sse[TX_32X32], ref_best_rd, - bs, TX_32X32); + 
txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], + &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32); if (bs >= BLOCK_16X16) - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], - &s[TX_16X16], &sse[TX_16X16], ref_best_rd, - bs, TX_16X16); - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], - &sse[TX_8X8], ref_best_rd, bs, TX_8X8); - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], - &sse[TX_4X4], ref_best_rd, bs, TX_4X4); + txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], + &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16); + txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8); + txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, bs); } if (psse) - *psse = sse[mbmi->txfm_size]; + *psse = sse[mbmi->tx_size]; } static int conditional_skipintra(MB_PREDICTION_MODE mode, @@ -1162,7 +987,7 @@ static int conditional_skipintra(MB_PREDICTION_MODE mode, best_intra_mode != V_PRED && best_intra_mode != D45_PRED) return 1; - if (mode == D27_PRED && + if (mode == D207_PRED && best_intra_mode != H_PRED && best_intra_mode != D45_PRED) return 1; @@ -1179,8 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, - BLOCK_SIZE_TYPE bsize, - int64_t rd_thresh) { + BLOCK_SIZE bsize, int64_t rd_thresh) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = rd_thresh; @@ -1190,9 +1014,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; - uint8_t *src_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib, + uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib, p->src.buf, src_stride); - uint8_t *dst_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib, + uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib, pd->dst.buf, dst_stride); int16_t *src_diff, *coeff; @@ -1208,11 +1032,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(ta, a, sizeof(ta)); vpx_memcpy(tl, l, sizeof(tl)); - xd->mode_info_context->mbmi.txfm_size = TX_4X4; + xd->this_mi->mbmi.tx_size = TX_4X4; for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; + + if (!(cpi->sf.intra_y_mode_mask & (1 << mode))) + continue; + // Only do the oblique modes if the best so far is // one of the neighboring directional modes if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { @@ -1234,10 +1062,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode = mode; - src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block, - p->src_diff); - coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); + xd->this_mi->bmi[block].as_mode = mode; + src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); + coeff = BLOCK_OFFSET(x->plane[0].coeff, block); vp9_predict_intra_block(xd, block, 1, TX_4X4, mode, x->skip_encode ? 
src : dst, @@ -1257,20 +1084,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, } scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); - ratey += cost_coeffs(x, 0, block, PLANE_TYPE_Y_WITH_DC, + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, scan, vp9_get_coef_neighbors_handle(scan)); - distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, - block, 16), + distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &ssz) >> 2; if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) goto next; if (tx_type != DCT_DCT) - vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride, tx_type); else - xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride); } } @@ -1312,7 +1138,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + MODE_INFO *const mic = xd->this_mi; + const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; + const MODE_INFO *left_mi = xd->mi_8x8[-1]; + const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; @@ -1322,7 +1151,6 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; int *bmode_costs; - MODE_INFO *const mic = xd->mode_info_context; vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left)); @@ -1332,24 +1160,22 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block. for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - const int mis = xd->mode_info_stride; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); - int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int64_t UNINITIALIZED_IS_SAFE(d), this_rd; + MB_PREDICTION_MODE best_mode = DC_PRED; + int r = INT_MAX, ry = INT_MAX; + int64_t d = INT64_MAX, this_rd = INT64_MAX; i = idy * 2 + idx; - if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis); + const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
- left_block_mode(mic, i) : DC_PRED; + left_block_mode(mic, left_mi, i) : + DC_PRED; bmode_costs = mb->y_mode_costs[A][L]; } this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, - t_above + idx, t_left + idy, - &r, &ry, &d, bsize, - best_rd - total_rd); + t_above + idx, t_left + idy, &r, &ry, &d, + bsize, best_rd - total_rd); if (this_rd >= best_rd - total_rd) return INT64_MAX; @@ -1372,7 +1198,7 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, *rate = cost; *rate_y = tot_rate_y; *distortion = total_distortion; - xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode; + mic->mbmi.mode = mic->bmi[3].as_mode; return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); } @@ -1380,15 +1206,16 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int64_t tx_cache[TX_MODES], int64_t best_rd) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = DC_PRED; MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mic = xd->this_mi; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd; - TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); + TX_SIZE best_tx = TX_4X4; int i; int *bmode_costs = x->mbmode_cost; @@ -1399,17 +1226,20 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, /* Y Search for intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_tx_cache[TX_MODES]; - MODE_INFO *const mic = xd->mode_info_context; - const int mis = xd->mode_info_stride; + MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; + MODE_INFO *left_mi = xd->mi_8x8[-1]; + + if (!(cpi->sf.intra_y_mode_mask & (1 << mode))) + continue; if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis); + const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0); const MB_PREDICTION_MODE L = xd->left_available ? 
- left_block_mode(mic, 0) : DC_PRED; + left_block_mode(mic, left_mi, 0) : DC_PRED; bmode_costs = x->y_mode_costs[A][L]; } - x->e_mbd.mode_info_context->mbmi.mode = mode; + mic->mbmi.mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, bsize, local_tx_cache, best_rd); @@ -1423,7 +1253,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_rd) { mode_selected = mode; best_rd = this_rd; - best_tx = x->e_mbd.mode_info_context->mbmi.txfm_size; + best_tx = mic->mbmi.tx_size; *rate = this_rate; *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; @@ -1431,7 +1261,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) { - for (i = 0; i < TX_MODES; i++) { + for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) { const int64_t adj_rd = this_rd + local_tx_cache[i] - local_tx_cache[cpi->common.tx_mode]; if (adj_rd < tx_cache[i]) { @@ -1441,61 +1271,78 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - x->e_mbd.mode_info_context->mbmi.mode = mode_selected; - x->e_mbd.mode_info_context->mbmi.txfm_size = best_tx; + mic->mbmi.mode = mode_selected; + mic->mbmi.tx_size = best_tx; return best_rd; } -static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - BLOCK_SIZE_TYPE bsize, - TX_SIZE uv_tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; - int64_t dummy; - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - vp9_encode_intra_block_uv(cm, x, bsize); - else - vp9_xform_quant_sbuv(cm, x, bsize); - - *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2, - sse ? 
sse : &dummy); - *rate = rdcost_uv(cm, x, bsize, uv_tx_size); - *skippable = vp9_sbuv_is_skippable(xd, bsize); -} - static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, - int64_t *sse, BLOCK_SIZE_TYPE bsize) { + int64_t *sse, BLOCK_SIZE bsize, + int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); + int plane; + int pnrate = 0, pnskip = 1; + int64_t pndist = 0, pnsse = 0; - if (mbmi->ref_frame[0] > INTRA_FRAME) + if (ref_best_rd < 0) + goto term; + + if (is_inter_block(mbmi)) vp9_subtract_sbuv(x, bsize); - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize, - uv_txfm_size); + *rate = 0; + *distortion = 0; + *sse = 0; + *skippable = 1; + + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, + ref_best_rd, plane, bsize, uv_txfm_size); + if (pnrate == INT_MAX) + goto term; + *rate += pnrate; + *distortion += pndist; + *sse += pnsse; + *skippable &= pnskip; + } + return; + + term: + *rate = INT_MAX; + *distortion = INT64_MAX; + *sse = INT64_MAX; + *skippable = 0; + return; } static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); + MB_PREDICTION_MODE mode_selected = DC_PRED; int64_t best_rd = INT64_MAX, this_rd; int this_rate_tokenonly, this_rate, s; - int64_t this_distortion; + int64_t this_distortion, this_sse; + + // int mode_mask = (bsize <= BLOCK_8X8) + // ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask; + + for (mode = DC_PRED; mode <= TM_PRED; mode++) { + // if (!(mode_mask & (1 << mode))) + if (!(cpi->sf.intra_uv_mode_mask & (1 << mode))) + continue; - MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ? 
- TM_PRED : cpi->sf.last_chroma_intra_mode; + x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; - for (mode = DC_PRED; mode <= last_mode; mode++) { - x->e_mbd.mode_info_context->mbmi.uv_mode = mode; super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, - &this_distortion, &s, NULL, bsize); + &this_distortion, &s, &this_sse, bsize, best_rd); + if (this_rate_tokenonly == INT_MAX) + continue; this_rate = this_rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); @@ -1510,7 +1357,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected; + x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected; return best_rd; } @@ -1518,12 +1365,13 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { int64_t this_rd; + int64_t this_sse; - x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; super_block_uvrd(&cpi->common, x, rate_tokenonly, - distortion, skippable, NULL, bsize); + distortion, skippable, &this_sse, bsize, INT64_MAX); *rate = *rate_tokenonly + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); @@ -1531,7 +1379,7 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, return this_rd; } -static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, +static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, MB_PREDICTION_MODE *mode_uv) { @@ -1541,27 +1389,25 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, // appropriate speed flag is set. if (cpi->sf.use_uv_intra_rd_estimate) { rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : - bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. } else { rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 - : bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); } - *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode; + *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode; } static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, int mode_context) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = xd->mode_info_context->mbmi.segment_id; + const int segment_id = xd->this_mi->mbmi.segment_id; // Don't account for mode here if segment skip is enabled. 
- if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { assert(is_inter_mode(mode)); return x->inter_mode_cost[mode_context][mode - NEARESTMV]; } else { @@ -1570,18 +1416,18 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, } void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) { - x->e_mbd.mode_info_context->mbmi.mode = mb; - x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int; + x->e_mbd.mi_8x8[0]->mbmi.mode = mb; + x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int; } static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv); @@ -1594,8 +1440,8 @@ static int labels2mode(MACROBLOCK *x, int i, int_mv *second_best_ref_mv, int *mvjcost, int *mvcost[2], VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mode_info_context; - MB_MODE_INFO * mbmi = &mic->mbmi; + MODE_INFO *const mic = xd->this_mi; + MB_MODE_INFO *mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; @@ -1641,7 +1487,7 @@ static int labels2mode(MACROBLOCK *x, int i, } cost = cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + mbmi->mode_context[mbmi->ref_frame[0]]); mic->bmi[i].as_mv[0].as_int = this_mv->as_int; if (mbmi->ref_frame[1] > 0) @@ -1668,42 +1514,32 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, int k; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; - MODE_INFO *const mi = xd->mode_info_context; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + MODE_INFO *const mi = xd->this_mi; + const BLOCK_SIZE bsize = mi->mbmi.sb_type; const int width = plane_block_width(bsize, pd); const int height = plane_block_height(bsize, pd); int idx, idy; const int src_stride = x->plane[0].src.stride; - uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i, x->plane[0].src.buf, src_stride); - int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i, + int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i, x->plane[0].src_diff); - int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i); - uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - pd->pre[0].buf, - pd->pre[0].stride); - uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - pd->dst.buf, - pd->dst.stride); + int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i); + uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i, + pd->dst.buf, pd->dst.stride); int64_t thisdistortion = 0, thissse = 0; int thisrate = 0; + int ref, second_ref = has_second_ref(&mi->mbmi); - vp9_build_inter_predictor(pre, pd->pre[0].stride, - dst, pd->dst.stride, - &mi->bmi[i].as_mv[0].as_mv, - &xd->scale_factor[0], - width, height, 0, &xd->subpix, MV_PRECISION_Q3); - - if (mi->mbmi.ref_frame[1] > 0) { - uint8_t* const second_pre = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - pd->pre[1].buf, pd->pre[1].stride); - vp9_build_inter_predictor(second_pre, pd->pre[1].stride, + for (ref = 0; ref < 1 + second_ref; ++ref) { + const uint8_t *pre = 
raster_block_offset_uint8(BLOCK_8X8, i, + pd->pre[ref].buf, pd->pre[ref].stride); + vp9_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride, - &mi->bmi[i].as_mv[1].as_mv, - &xd->scale_factor[1], - width, height, 1, &xd->subpix, MV_PRECISION_Q3); + &mi->bmi[i].as_mv[ref].as_mv, + &xd->scale_factor[ref], + width, height, ref, &xd->subpix, MV_PRECISION_Q3); } vp9_subtract_block(height, width, src_diff, 8, src, src_stride, @@ -1715,15 +1551,15 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, int64_t ssz, rd, rd1, rd2; k += (idy * 2 + idx); - src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k, + src_diff = raster_block_offset_int16(BLOCK_8X8, k, x->plane[0].src_diff); - coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k); + coeff = BLOCK_OFFSET(x->plane[0].coeff, k); x->fwd_txm4x4(src_diff, coeff, 16); x->quantize_b_4x4(x, k, DCT_DCT, 16); - thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k, 16), + thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); thissse += ssz; - thisrate += cost_coeffs(x, 0, k, PLANE_TYPE_Y_WITH_DC, + thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4, vp9_default_scan_4x4, @@ -1764,7 +1600,7 @@ typedef struct { int64_t sse; int segment_yrate; MB_PREDICTION_MODE modes[4]; - SEG_RDSTAT rdstat[4][VP9_INTER_MODES]; + SEG_RDSTAT rdstat[4][INTER_MODES]; int mvthresh; } BEST_SEG_INFO; @@ -1778,26 +1614,23 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { } static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { - MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; - x->plane[0].src.buf = - raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i, - x->plane[0].src.buf, - x->plane[0].src.stride); - assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0x7) == 0); - x->e_mbd.plane[0].pre[0].buf = - raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i, - x->e_mbd.plane[0].pre[0].buf, - x->e_mbd.plane[0].pre[0].stride); + MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + + p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf, + p->src.stride); + assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); + pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf, + pd->pre[0].stride); if (mbmi->ref_frame[1]) - x->e_mbd.plane[0].pre[1].buf = - raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_SB8X8, 0, i, - x->e_mbd.plane[0].pre[1].buf, - x->e_mbd.plane[0].pre[1].stride); + pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf, + pd->pre[1].stride); } static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, struct buf_2d orig_pre[2]) { - MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi; x->plane[0].src = orig_src; x->e_mbd.plane[0].pre[0] = orig_pre[0]; if (mbmi->ref_frame[1]) @@ -1811,13 +1644,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int i, j, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; - MODE_INFO *mi = x->e_mbd.mode_info_context; + MODE_INFO *mi = x->e_mbd.mi_8x8[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; int segmentyrate = 0; - BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = 
num_4x4_blocks_high_lookup[bsize]; vp9_variance_fn_ptr_t *v_fn_ptr; @@ -1874,7 +1707,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 && (mbmi->ref_frame[1] <= 0 || frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) { - int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -1919,6 +1752,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int thissme, bestsme = INT_MAX; int sadpb = x->sadperbit4; int_mv mvp_full; + int max_mv; /* Is the best so far sufficiently good that we cant justify doing * and new motion search. */ @@ -1928,40 +1762,58 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->compressor_speed) { // use previous block's result as next block's MV predictor. if (i > 0) { - bsi->mvp.as_int = - x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int; + bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; if (i == 2) - bsi->mvp.as_int = - x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int; + bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; } } + if (i == 0) + max_mv = x->max_mv_context[mbmi->ref_frame[0]]; + else + max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3; + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { // Take wtd average of the step_params based on the last frame's // max mv magnitude and the best ref mvs of the current block for // the given reference. - if (i == 0) - step_param = (vp9_init_search_range( - cpi, x->max_mv_context[mbmi->ref_frame[0]]) + - cpi->mv_step_param) >> 1; - else - step_param = (vp9_init_search_range( - cpi, MAX(abs(bsi->mvp.as_mv.row), - abs(bsi->mvp.as_mv.col)) >> 3) + - cpi->mv_step_param) >> 1; + step_param = (vp9_init_search_range(cpi, max_mv) + + cpi->mv_step_param) >> 1; } else { step_param = cpi->mv_step_param; } - further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; - mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3; mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3; + if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) { + mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3; + mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3; + step_param = MAX(step_param, 8); + } + + further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; // adjust src pointer for this block mi_buf_shift(x, i); - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 0, v_fn_ptr, - bsi->ref_mv, &mode_mv[NEWMV]); + if (cpi->sf.search_method == HEX) { + bestsme = vp9_hex_search(x, &mvp_full, + step_param, + sadpb, 1, v_fn_ptr, 1, + bsi->ref_mv, &mode_mv[NEWMV]); + } else if (cpi->sf.search_method == SQUARE) { + bestsme = vp9_square_search(x, &mvp_full, + step_param, + sadpb, 1, v_fn_ptr, 1, + bsi->ref_mv, &mode_mv[NEWMV]); + } else if (cpi->sf.search_method == BIGDIA) { + bestsme = vp9_bigdia_search(x, &mvp_full, + step_param, + sadpb, 1, v_fn_ptr, 1, + bsi->ref_mv, &mode_mv[NEWMV]); + } else { + bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, + sadpb, further_steps, 0, v_fn_ptr, + bsi->ref_mv, &mode_mv[NEWMV]); + } // Should we do a full search (best quality only) if (cpi->compressor_speed == 0) { @@ -1976,13 +1828,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (thissme < bestsme) { bestsme = thissme; - mode_mv[NEWMV].as_int = - 
x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int; + mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int; } else { /* The full search result is actually worse so re-instate the * previous best vector */ - x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = - mode_mv[NEWMV].as_int; + mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int; } } @@ -1991,19 +1841,23 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, unsigned int sse; cpi->find_fractional_mv_step(x, &mode_mv[NEWMV], bsi->ref_mv, x->errorperbit, v_fn_ptr, + 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &distortion, &sse); - // safe motion search result for use in compound prediction + // save motion search result for use in compound prediction seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int; } + if (cpi->sf.adaptive_motion_search) + x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int; + // restore src pointers mi_buf_restore(x, orig_src, orig_pre); } if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV && - mbmi->interp_filter == vp9_switchable_interp[0]) { + mbmi->interp_filter == EIGHTTAP) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; @@ -2114,7 +1968,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (best_rd == INT64_MAX) { int iy, midx; for (iy = i + 1; iy < 4; ++iy) - for (midx = 0; midx < VP9_INTER_MODES; ++midx) + for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; return; @@ -2138,7 +1992,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (this_segment_rd > bsi->segment_rd) { int iy, midx; for (iy = i + 1; iy < 4; ++iy) - for (midx = 0; midx < VP9_INTER_MODES; ++midx) + for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; return; @@ -2182,7 +2036,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int i; BEST_SEG_INFO *bsi = bsi_buf + filter_idx; MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mode_info_context; + MODE_INFO *mi = xd->this_mi; MB_MODE_INFO *mbmi = &mi->mbmi; int mode_idx; @@ -2217,7 +2071,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, *returntotrate = bsi->r; *returndistortion = bsi->d; *returnyrate = bsi->segment_yrate; - *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8); + *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0); *psse = bsi->sse; mbmi->mode = bsi->modes[3]; @@ -2226,9 +2080,9 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, int ref_y_stride, - int ref_frame, BLOCK_SIZE_TYPE block_size ) { + int ref_frame, BLOCK_SIZE block_size ) { MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; int_mv this_mv; int i; int zero_seen = 0; @@ -2240,10 +2094,15 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; int row_offset, col_offset; + int num_mv_refs = MAX_MV_REF_CANDIDATES + + (cpi->sf.adaptive_motion_search && + cpi->common.show_frame && + block_size < cpi->sf.max_partition_size); // Get the sad for each candidate reference mv - for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) { - this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int; + for (i = 0; i < num_mv_refs; i++) { + this_mv.as_int = (i < 
MAX_MV_REF_CANDIDATES) ? + mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int; max_mv = MAX(max_mv, MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); @@ -2279,7 +2138,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, vp9_prob *comp_mode_p) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, + int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); @@ -2341,14 +2200,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int_mv *second_ref_mv, int64_t comp_pred_diff[NB_PREDICTION_TYPES], int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) { + int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->skip = x->skip; ctx->best_mode_index = mode_index; - ctx->mic = *xd->mode_info_context; + ctx->mic = *xd->this_mi; if (partition) ctx->partition_info = *partition; @@ -2364,7 +2223,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, // doesn't actually work this way memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); memcpy(ctx->best_filter_diff, best_filter_diff, - sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1)); + sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -2395,7 +2254,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int idx, MV_REFERENCE_FRAME frame_type, - BLOCK_SIZE_TYPE block_size, + BLOCK_SIZE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], @@ -2404,17 +2263,17 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, VP9_COMMON *cm = &cpi->common; YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; // set up scaling factors scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; scale[frame_type].x_offset_q4 = ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp, - VP9_REF_SCALE_SHIFT) & 0xf; + REF_SCALE_SHIFT) & 0xf; scale[frame_type].y_offset_q4 = ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp, - VP9_REF_SCALE_SHIFT) & 0xf; + REF_SCALE_SHIFT) & 0xf; // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. 
@@ -2422,11 +2281,10 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, &scale[frame_type], &scale[frame_type]); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context, - xd->prev_mode_info_context, + vp9_find_mv_refs(&cpi->common, xd, xd->this_mi, + xd->last_mi, frame_type, - mbmi->ref_mvs[frame_type], - cpi->common.ref_frame_sign_bias, mi_row, mi_col); + mbmi->ref_mvs[frame_type], mi_row, mi_col); // Candidate refinement carried out at encoder and decoder vp9_find_best_ref_mvs(xd, @@ -2437,8 +2295,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. - if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE && - scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE) + if (!vp9_is_scaled(&scale[frame_type])) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -2446,27 +2303,27 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; int fb = get_ref_frame_idx(cpi, ref_frame); - if (cpi->scaled_ref_idx[fb] != cpi->common.ref_frame_map[fb]) - scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb]]; + int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame); + if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb]) + scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]]; return scaled_ref_frame; } -static INLINE int get_switchable_rate(MACROBLOCK *x) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - - const int c = vp9_get_pred_context_switchable_interp(xd); - const int m = vp9_switchable_interp_map[mbmi->interp_filter]; - return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; +static INLINE int get_switchable_rate(const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; + const int ctx = vp9_get_pred_context_switchable_interp(xd); + return SWITCHABLE_INTERP_RATE_FACTOR * + x->switchable_interp_costs[ctx][mbmi->interp_filter]; } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; VP9_COMMON *cm = &cpi->common; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; int bestsme = INT_MAX; int further_steps, step_param; @@ -2474,7 +2331,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int_mv mvp_full; int ref = mbmi->ref_frame[0]; int_mv ref_mv = mbmi->ref_mvs[ref][0]; - const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -2494,7 +2351,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } - vp9_clamp_mv_min_max(x, &ref_mv); + vp9_clamp_mv_min_max(x, &ref_mv.as_mv); // Adjust search parameters based on small partitions' result. 
if (x->fast_ms) { @@ -2506,7 +2363,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, step_param = 8; // Get prediction MV. - mvp_full.as_int = x->pred_mv.as_int; + mvp_full.as_int = x->pred_mv[ref].as_int; // Adjust MV sign if needed. if (cm->ref_frame_sign_bias[ref]) { @@ -2525,21 +2382,49 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } else { step_param = cpi->mv_step_param; } - // mvp_full.as_int = ref_mv[0].as_int; - mvp_full.as_int = - mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; } + if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && + cpi->common.show_frame) { + int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize), + b_width_log2(bsize))); + step_param = MAX(step_param, boffset); + } + + mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ? + mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int : + x->pred_mv[ref].as_int; + mvp_full.as_mv.col >>= 3; mvp_full.as_mv.row >>= 3; // Further step/diamond searches as necessary further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[block_size], - &ref_mv, tmp_mv); + if (cpi->sf.search_method == HEX) { + bestsme = vp9_hex_search(x, &mvp_full, + step_param, + sadpb, 1, + &cpi->fn_ptr[block_size], 1, + &ref_mv, tmp_mv); + } else if (cpi->sf.search_method == SQUARE) { + bestsme = vp9_square_search(x, &mvp_full, + step_param, + sadpb, 1, + &cpi->fn_ptr[block_size], 1, + &ref_mv, tmp_mv); + } else if (cpi->sf.search_method == BIGDIA) { + bestsme = vp9_bigdia_search(x, &mvp_full, + step_param, + sadpb, 1, + &cpi->fn_ptr[block_size], 1, + &ref_mv, tmp_mv); + } else { + bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, + sadpb, further_steps, 1, + &cpi->fn_ptr[block_size], + &ref_mv, tmp_mv); + } x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -2547,17 +2432,22 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, x->mv_row_max = tmp_row_max; if (bestsme < INT_MAX) { - int dis; /* TODO: use dis in distortion calculation later. */ + int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv, x->errorperbit, &cpi->fn_ptr[block_size], + 0, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, &dis, &sse); } *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv, x->nmvjointcost, x->mvcost, 96); + + if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) + x->pred_mv[ref].as_int = tmp_mv->as_int; + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) @@ -2566,18 +2456,18 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv) { int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv ref_mv[2]; - const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); int ite; // Prediction buffer from second frame. 
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); @@ -2653,7 +2543,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Compound motion search on first ref frame. if (id) xd->plane[0].pre[0] = ref_yv12[id]; - vp9_clamp_mv_min_max(x, &ref_mv[id]); + vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv); // Use mv result from single mode as mvp. tmp_mv.as_int = frame_mv[refs[id]].as_int; @@ -2678,13 +2568,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv, - &ref_mv[id], - x->errorperbit, - &cpi->fn_ptr[block_size], - x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, - pw, ph); + bestsme = cpi->find_fractional_mv_step_comp( + x, &tmp_mv, + &ref_mv[id], + x->errorperbit, + &cpi->fn_ptr[block_size], + 0, cpi->sf.subpel_iters_per_step, + x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, + pw, ph); } if (id) @@ -2721,7 +2613,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, int *skippable, @@ -2732,10 +2624,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], - int64_t *psse, int64_t ref_best_rd) { + int64_t *psse, + const int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; const int is_comp_pred = (mbmi->ref_frame[1] > 0); const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; @@ -2747,7 +2640,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int64_t this_rd = 0; DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); int pred_exists = 0; - int interpolating_intpel_seen = 0; int intpel_mv; int64_t rd, best_rd = INT64_MAX; int best_needs_copy = 0; @@ -2782,7 +2674,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); *rate2 += rate_mv; frame_mv[refs[0]].as_int = - xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int; single_newmv[refs[0]].as_int = tmp_mv.as_int; } } @@ -2790,9 +2682,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, // if we're near/nearest and mv == 0,0, compare to zeromv if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[refs[0]].as_int == 0 && - !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) && + !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) && (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { - int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); int c3 = cost_mv_ref(cpi, ZEROMV, rfc); @@ -2849,7 +2741,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * words if you present them in that order, the second one is always known * if the first is known */ *rate2 += cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + mbmi->mode_context[mbmi->ref_frame[0]]); if (!(*mode_excluded)) { if (is_comp_pred) { @@ -2860,7 +2752,6 @@ 
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } pred_exists = 0; - interpolating_intpel_seen = 0; // Are all MVs integer pel for Y and UV intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 && (mbmi->mv[0].as_mv.col & 15) == 0; @@ -2869,98 +2760,97 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (mbmi->mv[1].as_mv.col & 15) == 0; // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - *best_filter = EIGHTTAP; - if (cpi->sf.use_8tap_always) { + if (cm->mcomp_filter_type != BILINEAR) { *best_filter = EIGHTTAP; - vp9_zero(cpi->rd_filter_cache); - } else { - int i, newbest; - int tmp_rate_sum = 0; - int64_t tmp_dist_sum = 0; - - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { - int j; - int64_t rs_rd; - const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i]; - const int is_intpel_interp = intpel_mv; - mbmi->interp_filter = filter; - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - rs = get_switchable_rate(x); - rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - - if (interpolating_intpel_seen && is_intpel_interp) { - cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, - tmp_rate_sum, tmp_dist_sum); - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], - cpi->rd_filter_cache[i] + rs_rd); - rd = cpi->rd_filter_cache[i]; - if (cm->mcomp_filter_type == SWITCHABLE) - rd += rs_rd; - } else { - int rate_sum = 0; - int64_t dist_sum = 0; - if ((cm->mcomp_filter_type == SWITCHABLE && - (!i || best_needs_copy)) || - (cm->mcomp_filter_type != SWITCHABLE && - (cm->mcomp_filter_type == mbmi->interp_filter || - (!interpolating_intpel_seen && is_intpel_interp)))) { - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = orig_dst[j]; - xd->plane[j].dst.stride = orig_dst_stride[j]; - } + if (x->source_variance < + cpi->sf.disable_filter_search_var_thresh) { + *best_filter = EIGHTTAP; + vp9_zero(cpi->rd_filter_cache); + } else { + int i, newbest; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; + + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + int j; + int64_t rs_rd; + mbmi->interp_filter = i; + vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + rs = get_switchable_rate(x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + + if (i > 0 && intpel_mv) { + cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, + tmp_rate_sum, tmp_dist_sum); + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; } else { - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; - xd->plane[j].dst.stride = 64; + int rate_sum = 0; + int64_t dist_sum = 0; + if ((cm->mcomp_filter_type == SWITCHABLE && + (!i || best_needs_copy)) || + (cm->mcomp_filter_type != SWITCHABLE && + (cm->mcomp_filter_type == mbmi->interp_filter || + (i == 0 && intpel_mv)))) { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = orig_dst[j]; + xd->plane[j].dst.stride = orig_dst_stride[j]; + } + } else { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; + xd->plane[j].dst.stride = 64; + } + } + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); + cpi->rd_filter_cache[i] = 
RDCOST(x->rdmult, x->rddiv, + rate_sum, dist_sum); + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; + if (i == 0 && intpel_mv) { + tmp_rate_sum = rate_sum; + tmp_dist_sum = dist_sum; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); - cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, - rate_sum, dist_sum); - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], - cpi->rd_filter_cache[i] + rs_rd); - rd = cpi->rd_filter_cache[i]; - if (cm->mcomp_filter_type == SWITCHABLE) - rd += rs_rd; - if (!interpolating_intpel_seen && is_intpel_interp) { - tmp_rate_sum = rate_sum; - tmp_dist_sum = dist_sum; - } - } - if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { - if (rd / 2 > ref_best_rd) { - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; + if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + if (rd / 2 > ref_best_rd) { + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; } - return INT64_MAX; } - } - newbest = i == 0 || rd < best_rd; - - if (newbest) { - best_rd = rd; - *best_filter = mbmi->interp_filter; - if (cm->mcomp_filter_type == SWITCHABLE && i && - !(interpolating_intpel_seen && is_intpel_interp)) - best_needs_copy = !best_needs_copy; - } + newbest = i == 0 || rd < best_rd; + + if (newbest) { + best_rd = rd; + *best_filter = mbmi->interp_filter; + if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv) + best_needs_copy = !best_needs_copy; + } - if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || - (cm->mcomp_filter_type != SWITCHABLE && - cm->mcomp_filter_type == mbmi->interp_filter)) { - pred_exists = 1; + if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || + (cm->mcomp_filter_type != SWITCHABLE && + cm->mcomp_filter_type == mbmi->interp_filter)) { + pred_exists = 1; + } } - interpolating_intpel_seen |= is_intpel_interp; - } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } } } // Set the appropriate filter @@ -3003,30 +2893,34 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mcomp_filter_type == SWITCHABLE) *rate2 += get_switchable_rate(x); - if (!is_comp_pred) { + if (!is_comp_pred && cpi->enable_encode_breakout) { if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; else if (x->encode_breakout) { - const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, - &xd->plane[1]); + const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); unsigned int var, sse; // Skipping threshold for ac. unsigned int thresh_ac; // The encode_breakout input unsigned int encode_breakout = x->encode_breakout << 4; + int max_thresh = 36000; + + // Use extreme low threshold for static frames to limit skipping. 
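Aside: the reworked switchable-filter loop above exploits one property: when both motion vectors are integer pel, every interpolation filter yields identical pixels, so the rate and distortion modeled for the first filter (i == 0) can be reused for the rest, and only the filter-signaling cost rs_rd differs between candidates. A condensed sketch of that caching pattern, with model_cost() and signal_cost() as illustrative stand-ins for model_rd_for_sb() and get_switchable_rate():

    #include <stdint.h>

    #define N_FILTERS 3  /* SWITCHABLE_FILTERS */

    static int64_t model_cost(int filter)  { return 1000 - filter; }  /* stub */
    static int64_t signal_cost(int filter) { return filter; }         /* stub */

    static int pick_filter_sketch(int intpel_mv, int64_t cache[N_FILTERS + 1]) {
      int best = 0;
      int64_t best_rd = INT64_MAX, rd0 = 0;
      cache[N_FILTERS] = INT64_MAX;
      for (int f = 0; f < N_FILTERS; ++f) {
        /* Integer-pel MVs: all filters produce the same prediction, so the
         * modeled cost of filter 0 is reused instead of re-predicting. */
        const int64_t base = (f > 0 && intpel_mv) ? rd0 : model_cost(f);
        const int64_t rd = base + signal_cost(f);
        if (f == 0) rd0 = base;
        cache[f] = base;
        if (cache[N_FILTERS] > rd) cache[N_FILTERS] = rd;
        if (rd < best_rd) { best_rd = rd; best = f; }
      }
      return best;
    }

In the real loop the signaling cost is only added when the frame-level filter type is SWITCHABLE, and the cached per-filter costs are reused later when computing best_filter_diff.
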
+ if (cpi->enable_encode_breakout == 2) + max_thresh = 128; // Calculate threshold according to dequant value. thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; - // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. - if (thresh_ac > 36000) - thresh_ac = 36000; - // Use encode_breakout input if it is bigger than internal threshold. if (thresh_ac < encode_breakout) thresh_ac = encode_breakout; + // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. + if (thresh_ac > max_thresh) + thresh_ac = max_thresh; + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, &sse); @@ -3065,8 +2959,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (sse_v - var_v < thresh_dc || sse_v == var_v)) { x->skip = 1; - *rate2 = 500; - *rate_uv = 0; + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); // Scaling factor for SSE from spatial domain to frequency domain // is 16. Adjust distortion accordingly. @@ -3084,7 +2978,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { int skippable_y, skippable_uv; - int64_t sseuv = INT_MAX; + int64_t sseuv = INT64_MAX; + int64_t rdcosty = INT64_MAX; // Y cost and distortion super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, @@ -3103,8 +2998,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += *rate_y; *distortion += *distortion_y; - super_block_uvrd(cm, x, rate_uv, distortion_uv, - &skippable_uv, &sseuv, bsize); + rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); + rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse)); + + super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv, + bsize, ref_best_rd - rdcosty); + if (*rate_uv == INT_MAX) { + *rate2 = INT_MAX; + *distortion = INT64_MAX; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } *psse += sseuv; *rate2 += *rate_uv; @@ -3122,17 +3029,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *returnrate, int64_t *returndist, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; - int y_skip = 0, uv_skip; + int y_skip = 0, uv_skip = 0; int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 }; x->skip_encode = 0; ctx->skip = 0; - xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; - if (bsize >= BLOCK_SIZE_SB8X8) { + xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME; + if (bsize >= BLOCK_8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, &y_skip, bsize, tx_cache, best_rd) >= best_rd) { @@ -3149,46 +3056,46 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, return; } rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8); + &dist_uv, &uv_skip, BLOCK_8X8); } if (y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); - *returndist = dist_y + (dist_uv >> 2); + *returndist = dist_y + dist_uv; vp9_zero(ctx->tx_rd_diff); } else { int i; *returnrate = rate_y + rate_uv + 
vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); - *returndist = dist_y + (dist_uv >> 2); + *returndist = dist_y + dist_uv; if (cpi->sf.tx_size_search_method == USE_FULL_RD) for (i = 0; i < TX_MODES; i++) ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; } - ctx->mic = *xd->mode_info_context; + ctx->mic = *xd->this_mi; } int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, int64_t *returndistortion, - BLOCK_SIZE_TYPE bsize, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const struct segmentation *seg = &xd->seg; - const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); - MB_PREDICTION_MODE this_mode; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; + const struct segmentation *seg = &cm->seg; + const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); + RD_PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; - unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; + unsigned char segment_id = mbmi->segment_id; int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; - int_mv single_newmv[MAX_REF_FRAMES]; + int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int idx_list[4] = {0, @@ -3201,9 +3108,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_tx_diff[TX_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; - int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1]; - int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; - MB_MODE_INFO best_mbmode; + int64_t best_filter_rd[SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]; + MB_MODE_INFO best_mbmode = { 0 }; int j; int mode_index, best_mode_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; @@ -3228,14 +3135,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; union b_mode_info best_bmodes[4]; PARTITION_INFO best_partition; - int bwsl = b_width_log2(bsize); - int bws = (1 << bwsl) / 4; // mode_info step for subsize - int bhsl = b_height_log2(bsize); - int bhs = (1 << bhsl) / 4; // mode_info step for subsize + const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; + const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; - x->skip_encode = (cpi->sf.skip_encode_frame && - xd->q_index < QIDX_SKIP_THRESH); + x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; for (i = 0; i < 4; i++) { int j; @@ -3248,14 +3152,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); - vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); - vpx_memset(&single_newmv, 0, sizeof(single_newmv)); for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + for (i = 0; i <= SWITCHABLE_FILTERS; i++) best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; @@ -3312,7 +3214,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; 
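Aside: back in handle_inter_mode(), the encode-breakout hunk above derives its AC skip threshold from the dequantizer step, floors it with the user's encode_breakout setting, and now caps it far more aggressively (128 instead of 36000) when enable_encode_breakout == 2 is used for static content; on a skip it also charges the real cost of the skip bit rather than a flat rate of 500. The threshold computation, restated as a self-contained sketch using the constants visible in the diff:

    /* Mirrors the AC-threshold logic in handle_inter_mode() above. */
    static unsigned int ac_skip_thresh(int ac_dequant,
                                       unsigned int encode_breakout,
                                       int breakout_mode) {
      unsigned int thresh_ac = (unsigned int)(ac_dequant * ac_dequant) / 9;
      const unsigned int max_thresh = (breakout_mode == 2) ? 128 : 36000;

      /* Use the encode_breakout input if it exceeds the internal value. */
      if (thresh_ac < (encode_breakout << 4))
        thresh_ac = encode_breakout << 4;

      /* Cap the threshold to avoid big PSNR loss at low bitrates. */
      if (thresh_ac > max_thresh)
        thresh_ac = max_thresh;
      return thresh_ac;
    }
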
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; - int skippable; + int skippable = 0; int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; @@ -3327,10 +3229,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, ref_frame = vp9_mode_order[mode_index].ref_frame; second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; - // Skip modes that have been masked off but always consider first mode. - if (mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) && - (cpi->unused_mode_skip_mask & (1 << mode_index)) ) - continue; + // Look at the reference frame of the best mode so far and set the + // skip mask to look at a subset of the remaining modes. + if (mode_index > cpi->sf.mode_skip_start) { + if (mode_index == (cpi->sf.mode_skip_start + 1)) { + switch (vp9_mode_order[best_mode_index].ref_frame) { + case INTRA_FRAME: + cpi->mode_skip_mask = 0; + break; + case LAST_FRAME: + cpi->mode_skip_mask = LAST_FRAME_MODE_MASK; + break; + case GOLDEN_FRAME: + cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK; + break; + case ALTREF_FRAME: + cpi->mode_skip_mask = ALT_REF_MODE_MASK; + break; + case NONE: + case MAX_REF_FRAMES: + assert(!"Invalid Reference frame"); + } + } + if (cpi->mode_skip_mask & (1 << mode_index)) + continue; + } // Skip if the current reference frame has been masked off if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && @@ -3339,7 +3262,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Test best rd so far against threshold for trying this mode. if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] * - cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) || + cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) || cpi->rd_threshes[bsize][mode_index] == INT_MAX) continue; @@ -3355,7 +3278,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (x->fast_ms > 2 && ref_frame != x->subblock_ref) continue; - if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) { + if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) { if (!(ref_frame_mask & (1 << ref_frame))) { continue; } @@ -3393,19 +3316,24 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning, jkoleszar): scaling reference frame not supported for // SPLITMV. 
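Aside: the new skip logic above replaces the old unused_mode_skip mask. Once cpi->sf.mode_skip_start candidates have been evaluated, a bitmask chosen by the reference frame of the best mode so far prunes the remaining candidates. A sketch of the pattern; the real mask constants (LAST_FRAME_MODE_MASK and friends) are defined elsewhere in the encoder, and the values below are placeholders:

    enum ref { INTRA = 0, LAST, GOLDEN, ALTREF };

    static unsigned int skip_mask_for_best_ref(int best_ref) {
      switch (best_ref) {
        case LAST:   return 0x00F0F0F0u;  /* placeholder LAST_FRAME_MODE_MASK */
        case GOLDEN: return 0x00F0F00Fu;  /* placeholder GOLDEN_FRAME_MODE_MASK */
        case ALTREF: return 0x00F00FF0u;  /* placeholder ALT_REF_MODE_MASK */
        default:     return 0;            /* intra best: keep every mode */
      }
    }

    /* In the mode loop:
     *   if (mode_index > sf->mode_skip_start &&
     *       (mode_skip_mask & (1 << mode_index)))
     *     continue;
     */
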
if (ref_frame > 0 && - (scale_factor[ref_frame].x_scale_fp != VP9_REF_NO_SCALE || - scale_factor[ref_frame].y_scale_fp != VP9_REF_NO_SCALE) && - this_mode == SPLITMV) + vp9_is_scaled(&scale_factor[ref_frame]) && + this_mode == RD_SPLITMV) continue; if (second_ref_frame > 0 && - (scale_factor[second_ref_frame].x_scale_fp != VP9_REF_NO_SCALE || - scale_factor[second_ref_frame].y_scale_fp != VP9_REF_NO_SCALE) && - this_mode == SPLITMV) + vp9_is_scaled(&scale_factor[second_ref_frame]) && + this_mode == RD_SPLITMV) + continue; + + if (bsize >= BLOCK_8X8 && + (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)) + continue; + + if (bsize < BLOCK_8X8 && + !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)) continue; set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); - mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; // Evaluate all sub-pel filters irrespective of whether we can use @@ -3413,13 +3341,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->interp_filter = cm->mcomp_filter_type; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - if (bsize >= BLOCK_SIZE_SB8X8 && - (this_mode == I4X4_PRED || this_mode == SPLITMV)) - continue; - if (bsize < BLOCK_SIZE_SB8X8 && - !(this_mode == I4X4_PRED || this_mode == SPLITMV)) - continue; - if (comp_pred) { if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -3452,7 +3373,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { + (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to @@ -3464,11 +3385,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if ((this_mode != ZEROMV && - !(this_mode == NEARMV && - frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && - !(this_mode == NEARESTMV && - frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || + if ((this_mode != RD_ZEROMV && + !(this_mode == RD_NEARMV && + frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) && + !(this_mode == RD_NEARESTMV && + frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) || ref_frame != ALTREF_FRAME) { continue; } @@ -3480,11 +3401,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // a representative block in the boundary ( first ) and then implement a // function that does sads when inside the border.. if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) && - this_mode == NEWMV) { + this_mode == RD_NEWMV) { continue; } - if (this_mode == I4X4_PRED) { +#ifdef MODE_TEST_HIT_STATS + // TEST/DEBUG CODE + // Keep a rcord of the number of test hits at each size + cpi->mode_test_hits[bsize]++; +#endif + + if (this_mode == RD_I4X4_PRED) { int rate; /* @@ -3493,8 +3420,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; */ - // I4X4_PRED is only considered for block sizes less than 8x8. - mbmi->txfm_size = TX_4X4; + // RD_I4X4_PRED is only considered for block sizes less than 8x8. 
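Aside: the vp9_is_scaled() helper introduced above is exactly the predicate the deleted lines spelled out inline; a one-to-one sketch (the numeric value of VP9_REF_NO_SCALE is an assumption here, only the comparison matters):

    #define REF_NO_SCALE (1 << 14)  /* assumed value of VP9_REF_NO_SCALE */

    struct scale_factors_sketch { int x_scale_fp, y_scale_fp; };

    static int is_scaled_sketch(const struct scale_factors_sketch *sf) {
      /* Identical to the removed open-coded test in the hunk above. */
      return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE;
    }
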
+ mbmi->tx_size = TX_4X4; if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, best_rd) >= best_rd) continue; @@ -3521,31 +3448,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes - static const int skip_intra_var_thresh[BLOCK_SIZE_TYPES] = { + static const int skip_intra_var_thresh[BLOCK_SIZES] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, }; if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - this_mode != DC_PRED && + this_mode != RD_DC_PRED && x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) continue; // Only search the oblique modes if the best so far is // one of the neighboring directional modes if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) { if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME) continue; } + mbmi->mode = rd_mode_to_mode(this_mode); if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { if (conditional_skipintra(mbmi->mode, best_intra_mode)) continue; } + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; - uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]); + uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]); if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], @@ -3559,10 +3488,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, mbmi->uv_mode = mode_uv[uv_tx]; rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; - if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) + if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; - } else if (this_mode == SPLITMV) { + } else if (this_mode == RD_SPLITMV) { const int is_comp_pred = second_ref_frame > 0; int rate; int64_t distortion; @@ -3577,7 +3506,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, union b_mode_info tmp_best_bmodes[16]; MB_MODE_INFO tmp_best_mbmode; PARTITION_INFO tmp_best_partition; - BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS]; + BEST_SEG_INFO bsi[SWITCHABLE_FILTERS]; int pred_exists = 0; int uv_skippable; if (is_comp_pred) { @@ -3595,70 +3524,79 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->rd_threshes[bsize][THR_NEWA]; this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? 
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh; - xd->mode_info_context->mbmi.txfm_size = TX_4X4; - - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; - for (switchable_filter_index = 0; - switchable_filter_index < VP9_SWITCHABLE_FILTERS; - ++switchable_filter_index) { - int newbest, rs; - int64_t rs_rd; - mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index]; - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, - &mbmi->ref_mvs[ref_frame][0], - second_ref, - best_yrd, - &rate, &rate_y, &distortion, - &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, - bsi, switchable_filter_index, - mi_row, mi_col); - - if (tmp_rd == INT64_MAX) - continue; - cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; - rs = get_switchable_rate(x); - rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd); - if (cm->mcomp_filter_type == SWITCHABLE) - tmp_rd += rs_rd; - - newbest = (tmp_rd < tmp_best_rd); - if (newbest) { - tmp_best_filter = mbmi->interp_filter; - tmp_best_rd = tmp_rd; - } - if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || - (mbmi->interp_filter == cm->mcomp_filter_type && - cm->mcomp_filter_type != SWITCHABLE)) { - tmp_best_rdu = tmp_rd; - tmp_best_rate = rate; - tmp_best_ratey = rate_y; - tmp_best_distortion = distortion; - tmp_best_sse = total_sse; - tmp_best_skippable = skippable; - tmp_best_mbmode = *mbmi; - tmp_best_partition = *x->partition_info; - for (i = 0; i < 4; i++) - tmp_best_bmodes[i] = xd->mode_info_context->bmi[i]; - pred_exists = 1; - if (switchable_filter_index == 0 && - cpi->sf.use_rd_breakout && - best_rd < INT64_MAX) { - if (tmp_best_rdu / 2 > best_rd) { - // skip searching the other filters if the first is - // already substantially larger than the best so far + xd->this_mi->mbmi.tx_size = TX_4X4; + + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; + if (cm->mcomp_filter_type != BILINEAR) { + tmp_best_filter = EIGHTTAP; + if (x->source_variance < + cpi->sf.disable_filter_search_var_thresh) { + tmp_best_filter = EIGHTTAP; + vp9_zero(cpi->rd_filter_cache); + } else { + for (switchable_filter_index = 0; + switchable_filter_index < SWITCHABLE_FILTERS; + ++switchable_filter_index) { + int newbest, rs; + int64_t rs_rd; + mbmi->interp_filter = switchable_filter_index; + vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + + tmp_rd = rd_pick_best_mbsegmentation(cpi, x, + &mbmi->ref_mvs[ref_frame][0], + second_ref, + best_yrd, + &rate, &rate_y, &distortion, + &skippable, &total_sse, + (int)this_rd_thresh, seg_mvs, + bsi, switchable_filter_index, + mi_row, mi_col); + + if (tmp_rd == INT64_MAX) + continue; + cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; + rs = get_switchable_rate(x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + cpi->rd_filter_cache[SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + tmp_rd + rs_rd); + if (cm->mcomp_filter_type == SWITCHABLE) + tmp_rd += rs_rd; + + newbest = (tmp_rd < tmp_best_rd); + if (newbest) { tmp_best_filter = mbmi->interp_filter; - tmp_best_rdu = INT64_MAX; - break; + tmp_best_rd = tmp_rd; } - } + if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || + (mbmi->interp_filter == cm->mcomp_filter_type && + cm->mcomp_filter_type != SWITCHABLE)) { + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_sse = total_sse; + 
tmp_best_skippable = skippable; + tmp_best_mbmode = *mbmi; + tmp_best_partition = *x->partition_info; + for (i = 0; i < 4; i++) + tmp_best_bmodes[i] = xd->this_mi->bmi[i]; + pred_exists = 1; + if (switchable_filter_index == 0 && + cpi->sf.use_rd_breakout && + best_rd < INT64_MAX) { + if (tmp_best_rdu / 2 > best_rd) { + // skip searching the other filters if the first is + // already substantially larger than the best so far + tmp_best_filter = mbmi->interp_filter; + tmp_best_rdu = INT64_MAX; + break; + } + } + } + } // switchable_filter_index loop } - } // switchable_filter_index loop + } if (tmp_best_rdu == INT64_MAX) continue; @@ -3694,7 +3632,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *mbmi = tmp_best_mbmode; *x->partition_info = tmp_best_partition; for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i] = tmp_best_bmodes[i]; + xd->this_mi->bmi[i] = tmp_best_bmodes[i]; } rate2 += rate; @@ -3711,16 +3649,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred); - if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) < - best_rd) { + tmp_best_rdu = best_rd - + MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2), + RDCOST(x->rdmult, x->rddiv, 0, total_sse)); + + if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far // then dont bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, - BLOCK_SIZE_SB8X8); - vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8); - super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv, - &uv_skippable, &uv_sse, - BLOCK_SIZE_SB8X8, TX_4X4); + BLOCK_8X8); + super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable, + &uv_sse, BLOCK_8X8, tmp_best_rdu); + if (rate_uv == INT_MAX) + continue; rate2 += rate_uv; distortion2 += distortion_uv; skippable = skippable && uv_skippable; @@ -3731,6 +3672,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, tx_cache[i] = tx_cache[ONLY_4X4]; } } else { + mbmi->mode = rd_mode_to_mode(this_mode); compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); this_rd = handle_inter_mode(cpi, x, bsize, tx_cache, @@ -3766,7 +3708,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); - if (skippable && bsize >= BLOCK_SIZE_SB8X8) { + if (skippable && bsize >= BLOCK_8X8) { // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); // for best yrd calculation @@ -3815,30 +3757,30 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } // Keep record of best intra rd - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME && - is_intra_mode(xd->mode_info_context->mbmi.mode) && + if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME && + is_intra_mode(xd->this_mi->mbmi.mode) && this_rd < best_intra_rd) { best_intra_rd = this_rd; - best_intra_mode = xd->mode_info_context->mbmi.mode; + best_intra_mode = xd->this_mi->mbmi.mode; } // Keep record of best inter rd with single reference - if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME && - xd->mode_info_context->mbmi.ref_frame[1] == NONE && + if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME && + xd->this_mi->mbmi.ref_frame[1] == NONE && !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; best_inter_ref_frame = ref_frame; - // best_inter_mode = xd->mode_info_context->mbmi.mode; + // best_inter_mode = xd->this_mi->mbmi.mode; } if (!disable_skip && ref_frame == INTRA_FRAME) 
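Aside: for the SPLITMV path above (and the analogous rdcosty computation in handle_inter_mode()), the UV search is handed only the rate-distortion headroom left after Y, lower-bounded by the cost of coding a skip; when super_block_uvrd() cannot beat that budget it returns INT_MAX and the mode is dropped. Restated with the RDCOST macro from vp9_rdopt.h:

    #include <stdint.h>

    #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static int64_t uv_rd_budget(int rdmult, int rddiv,
                                int rate2, int64_t distortion2,
                                int64_t total_sse, int64_t best_rd) {
      /* Headroom = best so far minus the cheaper of "code it" and "skip it";
       * a result <= 0 means the UV search cannot possibly help. */
      return best_rd - MIN(RDCOST(rdmult, rddiv, rate2, distortion2),
                           RDCOST(rdmult, rddiv, 0, total_sse));
    }
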
{ for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + for (i = 0; i <= SWITCHABLE_FILTERS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } - if (this_mode != I4X4_PRED && this_mode != SPLITMV) { + if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) { // Store the respective mode distortions for later use. if (mode_distortions[this_mode] == -1 || distortion2 < mode_distortions[this_mode]) { @@ -3870,9 +3812,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_skip2 = this_skip2; best_partition = *x->partition_info; - if (this_mode == I4X4_PRED || this_mode == SPLITMV) + if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV) for (i = 0; i < 4; i++) - best_bmodes[i] = xd->mode_info_context->bmi[i]; + best_bmodes[i] = xd->this_mi->bmi[i]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history @@ -3890,29 +3832,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } } -#if 0 - // Testing this mode gave rise to an improvement in best error score. - // Lower threshold a bit for next time - cpi->rd_thresh_mult[mode_index] = - (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? - cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; - cpi->rd_threshes[mode_index] = - (cpi->rd_baseline_thresh[mode_index] >> 7) - * cpi->rd_thresh_mult[mode_index]; -#endif - } else { - // If the mode did not help improve the best error case then - // raise the threshold for testing that mode next time around. -#if 0 - cpi->rd_thresh_mult[mode_index] += 4; - - if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) - cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; - - cpi->rd_threshes[mode_index] = - (cpi->rd_baseline_thresh[mode_index] >> 7) - * cpi->rd_thresh_mult[mode_index]; -#endif } /* keep record of best compound/single-only prediction */ @@ -3945,9 +3864,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? - VP9_SWITCHABLE_FILTERS : - vp9_switchable_interp_map[cm->mcomp_filter_type]]; - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + SWITCHABLE_FILTERS : cm->mcomp_filter_type]; + for (i = 0; i <= SWITCHABLE_FILTERS; i++) { int64_t adj_rd; // In cases of poor prediction, filter_cache[] can contain really big // values, which actually are bigger than this_rd itself. This can @@ -3964,16 +3882,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best txfm size */ if (bsize < BLOCK_32X32) { if (bsize < BLOCK_16X16) { - if (this_mode == SPLITMV || this_mode == I4X4_PRED) + if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED) tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; } tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; } if (!mode_excluded && this_rd != INT64_MAX) { - for (i = 0; i < TX_MODES; i++) { + for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) { int64_t adj_rd = INT64_MAX; - if (this_mode != I4X4_PRED) { + if (this_mode != RD_I4X4_PRED) { adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; } else { adj_rd = this_rd; @@ -4003,18 +3921,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], - (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 - : bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); } } - // If indicated then mark the index of the chosen mode to be inspected at - // other block sizes. - if (bsize <= cpi->sf.unused_mode_skip_lvl) { - cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask & - (~((int64_t)1 << best_mode_index)); - } - // If we are using reference masking and the set mask flag is set then // create the reference frame mask. if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) @@ -4039,7 +3949,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) { + if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { *returnrate = INT_MAX; *returndistortion = INT_MAX; return best_rd; @@ -4057,57 +3967,43 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->sf.adaptive_rd_thresh) { for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { if (mode_index == best_mode_index) { - cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT; + cpi->rd_thresh_freq_fact[bsize][mode_index] -= + (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3); } else { - cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC; + cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC; if (cpi->rd_thresh_freq_fact[bsize][mode_index] > - (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) { + (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) { cpi->rd_thresh_freq_fact[bsize][mode_index] = - cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT; + cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT; } } } } - // TODO(rbultje) integrate with RD trd_thresh_freq_facthresholding -#if 0 - // Reduce the activation RD thresholds for the best choice mode - if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && - (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) { - int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2); - - cpi->rd_thresh_mult[best_mode_index] = - (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? 
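Aside: the adaptive rd_thresh_freq_fact update after the mode loop above replaces the old reset-to-base scheme. The winning mode's factor now decays geometrically (by 1/8 per hit), so its threshold relaxes gradually, while every other mode's factor creeps up by RD_THRESH_INC until it hits the adaptive_rd_thresh * MAX_RD_THRESH_FACT ceiling. As a sketch, with assumed values for the two constants:

    #define RD_THRESH_INC      1   /* assumed */
    #define MAX_RD_THRESH_FACT 32  /* assumed */

    static void update_thresh_freq_fact(int *fact, int num_modes,
                                        int best_mode,
                                        int adaptive_rd_thresh) {
      for (int m = 0; m < num_modes; ++m) {
        if (m == best_mode) {
          fact[m] -= fact[m] >> 3;  /* geometric decay toward the base */
        } else if ((fact[m] += RD_THRESH_INC) >
                   adaptive_rd_thresh * MAX_RD_THRESH_FACT) {
          fact[m] = adaptive_rd_thresh * MAX_RD_THRESH_FACT;  /* clamp */
        }
      }
    }
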
- cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; - cpi->rd_threshes[best_mode_index] = - (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; - } -#endif - // macroblock modes *mbmi = best_mbmode; x->skip |= best_skip2; if (best_mbmode.ref_frame[0] == INTRA_FRAME && - best_mbmode.sb_type < BLOCK_SIZE_SB8X8) { + best_mbmode.sb_type < BLOCK_8X8) { for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode; + xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode; } if (best_mbmode.ref_frame[0] != INTRA_FRAME && - best_mbmode.sb_type < BLOCK_SIZE_SB8X8) { + best_mbmode.sb_type < BLOCK_8X8) { for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i].as_mv[0].as_int = + xd->this_mi->bmi[i].as_mv[0].as_int = best_bmodes[i].as_mv[0].as_int; if (mbmi->ref_frame[1] > 0) for (i = 0; i < 4; i++) - xd->mode_info_context->bmi[i].as_mv[1].as_int = + xd->this_mi->bmi[i].as_mv[1].as_int = best_bmodes[i].as_mv[1].as_int; *x->partition_info = best_partition; - mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int; + mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; } for (i = 0; i < NB_PREDICTION_TYPES; ++i) { @@ -4118,14 +4014,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= SWITCHABLE_FILTERS; i++) { if (best_filter_rd[i] == INT64_MAX) best_filter_diff[i] = 0; else best_filter_diff[i] = best_rd - best_filter_rd[i]; } if (cm->mcomp_filter_type == SWITCHABLE) - assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0); + assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); } else { vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff)); } diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index 7c84b48..eba7df9 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -13,8 +13,6 @@ #define VP9_ENCODER_VP9_RDOPT_H_ #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) -#define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) - #define QIDX_SKIP_THRESH 115 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); @@ -22,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); void vp9_init_me_luts(); diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index 9564edc..10655e8 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -17,39 +17,42 @@ void vp9_enable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; + struct segmentation *const seg = &cpi->common.seg; - cpi->mb.e_mbd.seg.enabled = 1; - cpi->mb.e_mbd.seg.update_map = 1; - cpi->mb.e_mbd.seg.update_data = 1; + seg->enabled = 1; + seg->update_map = 1; + seg->update_data = 1; } void vp9_disable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; 
- cpi->mb.e_mbd.seg.enabled = 0; + struct segmentation *const seg = &cpi->common.seg; + seg->enabled = 0; } void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); + VP9_COMP *cpi = (VP9_COMP *)ptr; + struct segmentation *const seg = &cpi->common.seg; // Copy in the new segmentation map vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mi_rows * cpi->common.mi_cols)); // Signal that the map should be updated. - cpi->mb.e_mbd.seg.update_map = 1; - cpi->mb.e_mbd.seg.update_data = 1; + seg->update_map = 1; + seg->update_data = 1; } void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, unsigned char abs_delta) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); + VP9_COMP *cpi = (VP9_COMP *)ptr; + struct segmentation *const seg = &cpi->common.seg; - cpi->mb.e_mbd.seg.abs_delta = abs_delta; + seg->abs_delta = abs_delta; - vpx_memcpy(cpi->mb.e_mbd.seg.feature_data, feature_data, - sizeof(cpi->mb.e_mbd.seg.feature_data)); + vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); // TBD ?? Set the feature mask // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0, @@ -114,7 +117,7 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, +static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -126,8 +129,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - segment_id = mi->mbmi.segment_id; - xd->mode_info_context = mi; + segment_id = mi_8x8[0]->mbmi.segment_id; + set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); // Count the number of hits on each segment with no prediction @@ -135,7 +138,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type; // Test to see if the segment id matches the predicted value. 
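Aside: count_segs() above gathers three sets of statistics per block so that vp9_choose_segmap_coding_method() can price temporal prediction against direct coding: an unconditional histogram of segment ids, a hit/miss tally of the temporal prediction flag per prediction context, and a histogram restricted to mispredicted blocks. The bookkeeping in isolation (array sizes assume MAX_SEGMENTS == 8 and PREDICTION_PROBS == 3, matching the declarations in this file):

    static void count_one_block(int segment_id, int pred_segment_id,
                                int pred_context,
                                int no_pred_segcounts[8],
                                int (*temporal_predictor_count)[2],
                                int t_unpred_seg_counts[8]) {
      const int pred_flag = (segment_id == pred_segment_id);

      no_pred_segcounts[segment_id]++;                   /* direct coding    */
      temporal_predictor_count[pred_context][pred_flag]++;  /* flag pricing  */
      if (!pred_flag)
        t_unpred_seg_counts[segment_id]++;  /* ids still coded explicitly    */
    }
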
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); @@ -144,7 +147,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, // Store the prediction status for this mb and update counts // as appropriate - vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + vp9_set_pred_flag_seg_id(xd, pred_flag); temporal_predictor_count[pred_context][pred_flag]++; if (!pred_flag) @@ -153,95 +156,85 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, } } -static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi, +static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE bsize) { + const VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bwl, bhl; - const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); + int bw, bh; + const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = mi_width_log2(mi->mbmi.sb_type); - bhl = mi_height_log2(mi->mbmi.sb_type); - - if (bwl == bsl && bhl == bsl) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 1 << bsl, 1 << bsl, mi_row, mi_col); - } else if (bwl == bsl && bhl < bsl) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 1 << bsl, bs, mi_row, mi_col); - count_segs(cpi, mi + bs * mis, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, 1 << bsl, bs, mi_row + bs, mi_col); - } else if (bwl < bsl && bhl == bsl) { - count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col); - count_segs(cpi, mi + bs, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, bs, 1 << bsl, mi_row, mi_col + bs); + bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; + bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; + + if (bw == bs && bh == bs) { + count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, bs, mi_row, mi_col); + } else if (bw == bs && bh < bs) { + count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, bs, hbs, mi_row, mi_col); + count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, bs, hbs, + mi_row + hbs, mi_col); + } else if (bw < bs && bh == bs) { + count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col); + count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count, + t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); } else { - BLOCK_SIZE_TYPE subsize; + const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; int n; - assert(bwl < bsl && bhl < bsl); - if (bsize == BLOCK_64X64) { - subsize = BLOCK_32X32; - } else if (bsize == BLOCK_32X32) { - subsize = BLOCK_16X16; - } else { - assert(bsize == BLOCK_16X16); - subsize = BLOCK_8X8; - } + assert(bw < bs && bh < bs); for (n = 0; n < 4; n++) { - const int y_idx = n >> 1, x_idx = n & 0x01; + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, + count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, - mi_row + y_idx * bs, mi_col + x_idx * bs, subsize); + 
mi_row + mi_dr, mi_col + mi_dc, subsize); } } } void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - struct segmentation *seg = &cpi->mb.e_mbd.seg; + struct segmentation *seg = &cm->seg; int no_pred_cost; int t_pred_cost = INT_MAX; int i, tile_col, mi_row, mi_col; - int temporal_predictor_count[PREDICTION_PROBS][2]; - int no_pred_segcounts[MAX_SEGMENTS]; - int t_unpred_seg_counts[MAX_SEGMENTS]; + int temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } }; + int no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; vp9_prob no_pred_tree[SEG_TREE_PROBS]; vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; - MODE_INFO *mi_ptr, *mi; + MODE_INFO **mi_ptr, **mi; // Set default state for the segment tree probabilities and the // temporal coding probabilities vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs)); - vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts)); - vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts)); - vpx_memset(temporal_predictor_count, 0, sizeof(temporal_predictor_count)); - // First of all generate stats regarding how well the last segment map // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { vp9_get_tile_col_offsets(cm, tile_col); - mi_ptr = cm->mi + cm->cur_tile_mi_col_start; + mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index a692c01..63826ee 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -153,11 +153,11 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, /*cpi->sf.search_method == HEX*/ // TODO Check that the 16x16 vf & sdf are selected here // Ignore mv costing by sending NULL pointer instead of cost arrays - ref_mv = &x->e_mbd.mode_info_context->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full, ref_mv, - step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], - NULL, NULL, NULL, NULL, - &best_ref_mv1); + ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; + bestsme = vp9_hex_search(x, &best_ref_mv1_full, + step_param, sadpb, 1, + &cpi->fn_ptr[BLOCK_16X16], + 0, &best_ref_mv1, ref_mv); #if ALT_REF_SUBPEL_ENABLED // Try sub-pixel MC? 
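Aside: the temporal-filter hunks above keep the same two-stage motion search, a full-pel hex search followed by optional sub-pel refinement, but route it through the updated vp9_hex_search / find_fractional_mv_step signatures; the NULL cost tables mean pure prediction error is minimized, since these vectors are never coded. The shape of the pattern, with stub helpers that are hypothetical stand-ins rather than libvpx calls:

    #include <stddef.h>

    /* Stubs for illustration only. */
    static int hex_search_fullpel(int *row, int *col) {
      (void)row; (void)col; return 0;
    }
    static int subpel_refine(int *row, int *col,
                             const int *joint_cost, const int *comp_cost) {
      (void)row; (void)col; (void)joint_cost; (void)comp_cost; return 0;
    }

    static int tf_motion_search_sketch(int *row, int *col, int do_subpel) {
      int err = hex_search_fullpel(row, col);       /* stage 1: integer pel */
      if (do_subpel)                                /* stage 2, if enabled  */
        err = subpel_refine(row, col, NULL, NULL);  /* NULL: ignore MV rate */
      return err;
    }
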
@@ -170,6 +170,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], + 0, cpi->sf.subpel_iters_per_step, NULL, NULL, &distortion, &sse); } @@ -244,8 +245,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, if (cpi->frames[frame] == NULL) continue; - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col = 0; + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row = 0; + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col = 0; if (frame == alt_ref_index) { filter_weight = 2; @@ -278,8 +279,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->u_buffer + mb_uv_offset, cpi->frames[frame]->v_buffer + mb_uv_offset, cpi->frames[frame]->y_stride, - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.row, - mbd->mode_info_context->bmi[0].as_mv[0].as_mv.col, + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row, + mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col, predictor); // Apply the filter (YUV) diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index caa89b2..0c9bf9d 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -97,103 +97,51 @@ struct tokenize_b_args { TX_SIZE tx_size; }; -static void set_entropy_context_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; - TX_SIZE tx_size = ss_txfrm_size >> 1; - MACROBLOCKD *xd = args->xd; - const int bwl = b_width_log2(bsize); - const int off = block >> (2 * tx_size); - const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; - const int aoff = (off & ((1 << mod) - 1)) << tx_size; - const int loff = (off >> mod) << tx_size; - ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; - ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - const int eob = xd->plane[plane].eobs[block]; - const int tx_size_in_blocks = 1 << tx_size; - - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff, - A, L); - } else { - vpx_memset(A, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - vpx_memset(L, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *pd = &xd->plane[plane]; + int aoff, loff; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); + set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff); } -static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; - const TX_SIZE tx_size = ss_txfrm_size >> 1; - const int tx_size_in_blocks = 1 << tx_size; - MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + struct macroblockd_plane *pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = &xd->this_mi->mbmi; int pt; /* near block/prev token context index */ int c = 0, rc = 0; TOKENEXTRA *t = *tp; /* store tokens starting here */ - const int eob = xd->plane[plane].eobs[block]; - const PLANE_TYPE type = xd->plane[plane].plane_type; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); - const int bwl = 
b_width_log2(bsize); - const int off = block >> (2 * tx_size); - const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; - const int aoff = (off & ((1 << mod) - 1)) << tx_size; - const int loff = (off >> mod) << tx_size; - ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; - ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - int seg_eob; + const int eob = pd->eobs[block]; + const PLANE_TYPE type = pd->plane_type; + const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); + const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; - vp9_coeff_count *counts; - vp9_coeff_probs_model *coef_probs; + vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; + vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; const int ref = is_inter_block(mbmi); - ENTROPY_CONTEXT above_ec, left_ec; uint8_t token_cache[1024]; const uint8_t *band_translate; - assert((!type && !plane) || (type && plane)); + ENTROPY_CONTEXT *A, *L; + const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); + int aoff, loff; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); - counts = cpi->coef_counts[tx_size]; - coef_probs = cpi->common.fc.coef_probs[tx_size]; - switch (tx_size) { - default: - case TX_4X4: - above_ec = A[0] != 0; - left_ec = L[0] != 0; - seg_eob = 16; - scan = get_scan_4x4(get_tx_type_4x4(type, xd, block)); - band_translate = vp9_coefband_trans_4x4; - break; - case TX_8X8: - above_ec = !!*(uint16_t *)A; - left_ec = !!*(uint16_t *)L; - seg_eob = 64; - scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - band_translate = vp9_coefband_trans_8x8plus; - break; - case TX_16X16: - above_ec = !!*(uint32_t *)A; - left_ec = !!*(uint32_t *)L; - seg_eob = 256; - scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - band_translate = vp9_coefband_trans_8x8plus; - break; - case TX_32X32: - above_ec = !!*(uint64_t *)A; - left_ec = !!*(uint64_t *)L; - seg_eob = 1024; - scan = vp9_default_scan_32x32; - band_translate = vp9_coefband_trans_8x8plus; - break; - } - - pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan); + A = pd->above_context + aoff; + L = pd->left_context + loff; - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) - seg_eob = 0; + assert((!type && !plane) || (type && plane)); + pt = get_entropy_context(xd, tx_size, type, block, A, L, + &scan, &band_translate); + nb = vp9_get_coef_neighbors_handle(scan); c = 0; do { const int band = get_coef_band(band_translate, c); @@ -227,62 +175,53 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, } while (c < eob && ++c < seg_eob); *tp = t; - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, c, aoff, loff, - A, L); - } else { - vpx_memset(A, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - vpx_memset(L, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } + + set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff); } struct is_skippable_args { MACROBLOCKD *xd; int *skippable; }; + static void is_skippable(int plane, int block, - BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *argv) { + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *argv) { struct is_skippable_args *args = argv; args->skippable[0] &= (!args->xd->plane[plane].eobs[block]); } -int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { +int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) { int result = 1; struct is_skippable_args args = {xd, 
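Aside: the long switch deleted from tokenize_b() above encoded two regularities that get_entropy_context() now hides: a transform of size tx covers 1 << tx 4x4 context positions, so the above/left flags were read as 1-, 2-, 4- or 8-byte loads, and seg_eob grows as 16 << (2 * tx). The same tests without the type punning; combine_ctx reflects my reading of combine_entropy_contexts, not a quoted definition:

    #include <stdint.h>

    typedef int8_t ENTROPY_CONTEXT;

    /* Any nonzero context byte under this transform? Replaces the old
     * !!*(uint16_t/uint32_t/uint64_t *) loads, one byte per 4x4 column. */
    static int ctx_nonzero(const ENTROPY_CONTEXT *ctx, int tx_size) {
      const int n = 1 << tx_size;  /* 1, 2, 4, 8 for TX_4X4..TX_32X32 */
      for (int i = 0; i < n; ++i)
        if (ctx[i]) return 1;
      return 0;
    }

    static int seg_eob_for_tx(int tx_size) {
      return 16 << (2 * tx_size);  /* 16, 64, 256, 1024 */
    }

    static int combine_ctx(int above_ec, int left_ec) {
      return (above_ec != 0) + (left_ec != 0);  /* assumed definition */
    }
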
&result}; foreach_transformed_block(xd, bsize, is_skippable, &args); return result; } -int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { - int result = 1; - struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args); - return result; -} - -int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { +int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { int result = 1; struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_uv(xd, bsize, is_skippable, &args); + foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &args); return result; } void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi; TOKENEXTRA *t_backup = *t; const int mb_skip_context = vp9_get_pred_context_mbskip(xd); - const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id, + const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = {cpi, xd, t, mbmi->txfm_size}; + struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size}; - mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize); - if (mbmi->mb_skip_coeff) { + mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize); + if (mbmi->skip_coeff) { if (!dry_run) cm->counts.mbskip[mb_skip_context][1] += skip_inc; - vp9_reset_sb_tokens_context(xd, bsize); + reset_skip_context(xd, bsize); if (dry_run) *t = t_backup; return; diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index 968bec7..b78e100 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -31,13 +31,13 @@ typedef struct { typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS + 1]; -int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); -int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); -int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); +int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize); +int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane); struct VP9_COMP; void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, - BLOCK_SIZE_TYPE bsize); + BLOCK_SIZE bsize); #ifdef ENTROPY_STATS void init_context_counters(); diff --git a/libvpx/vp9/encoder/vp9_variance_c.c b/libvpx/vp9/encoder/vp9_variance_c.c index 23e7767..155ba8a 100644 --- a/libvpx/vp9/encoder/vp9_variance_c.c +++ b/libvpx/vp9/encoder/vp9_variance_c.c @@ -46,12 +46,12 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 64, hfilter); @@ -68,13 +68,13 @@ unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - 
uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 64, hfilter); @@ -103,12 +103,12 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 32, hfilter); @@ -125,13 +125,13 @@ unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 32, hfilter); @@ -160,12 +160,12 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 32, hfilter); @@ -182,13 +182,13 @@ unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 32, hfilter); @@ -217,12 +217,12 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; const int16_t 
*hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 16, hfilter); @@ -239,13 +239,13 @@ unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 16, hfilter); @@ -440,10 +440,10 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, unsigned int *sse) { uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - uint16_t fdata3[5 * 4]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, @@ -466,10 +466,10 @@ unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer - uint16_t fdata3[5 * 4]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, @@ -488,12 +488,12 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[9 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 8, hfilter); @@ -510,13 +510,13 @@ unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[9 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 8, hfilter); @@ 
-532,12 +532,12 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[17 * 16]; // Temp data bufffer used in filtering + uint16_t fdata3[17 * 16]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 16, hfilter); @@ -559,8 +559,8 @@ unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 16, hfilter); @@ -577,12 +577,12 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 64, hfilter); @@ -599,13 +599,13 @@ unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering uint8_t temp2[68 * 64]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 65, 64, hfilter); @@ -621,12 +621,12 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 32, hfilter); @@ -643,13 +643,13 @@ unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering uint8_t temp2[36 * 32]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - 
vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 33, 32, hfilter); @@ -785,12 +785,12 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering + uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 16, hfilter); @@ -807,13 +807,13 @@ unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering + uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 16, hfilter); @@ -829,12 +829,12 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 8, hfilter); @@ -851,13 +851,13 @@ unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering + uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 17, 8, hfilter); @@ -873,12 +873,12 @@ unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[8 * 5]; // Temp data bufffer used in filtering + uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 5, 8, hfilter); @@ -895,13 +895,13 @@ unsigned int 
vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[8 * 5]; // Temp data bufffer used in filtering + uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 5, 8, hfilter); @@ -917,14 +917,14 @@ unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t fdata3[5 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be // of this big? same issue appears in all other block size settings. uint8_t temp2[20 * 16]; const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 4, hfilter); @@ -941,13 +941,13 @@ unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int dst_pixels_per_line, unsigned int *sse, const uint8_t *second_pred) { - uint16_t fdata3[5 * 8]; // Temp data bufffer used in filtering + uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering uint8_t temp2[20 * 16]; DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8); // compound pred buffer const int16_t *hfilter, *vfilter; - hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); - vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + hfilter = BILINEAR_FILTERS_2TAP(xoffset); + vfilter = BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, 1, 9, 4, hfilter); diff --git a/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c new file mode 100644 index 0000000..95ae266 --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c @@ -0,0 +1,2650 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> // SSE2 +#include "vp9/common/vp9_idct.h" // for cospi constants +#include "vpx_ports/mem.h" + +#if FDCT32x32_HIGH_PRECISION +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE __m128i k_cvtlo_epi16(__m128i a, __m128i mask16, __m128i kZero) { + // convert the lower 4 signed 16-bit integers into 4 signed 32-bit integers + __m128i sign_bit = _mm_and_si128(a, mask16); + __m128i b = _mm_unpacklo_epi16(a, kZero); + sign_bit = _mm_cmplt_epi16(sign_bit, kZero); + sign_bit = _mm_unpacklo_epi16(kZero, sign_bit); + return _mm_or_si128(sign_bit, b); +} + +static INLINE __m128i k_cvthi_epi16(__m128i a, __m128i mask16, __m128i kZero) { + // convert the upper 4 signed 16-bit integers into 4 signed 32-bit integers + __m128i sign_bit = _mm_and_si128(a, mask16); + __m128i b = _mm_unpackhi_epi16(a, kZero); + sign_bit = _mm_cmplt_epi16(sign_bit, kZero); + sign_bit = _mm_unpackhi_epi16(kZero, sign_bit); + return _mm_or_si128(sign_bit, b); +} +#endif + +void FDCT32x32_2D(int16_t *input, + int16_t *output_org, int pitch) { + // Calculate pre-multiplied strides + const int str1 = pitch >> 1; + const int str2 = pitch; + const int str3 = pitch + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32-bit constant corresponding to that pair.
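Two idioms recur throughout this kernel. In the default 16-bit path, every butterfly rotation interleaves two vectors with _mm_unpacklo_epi16/_mm_unpackhi_epi16, multiplies against a repeated (c1, c2) cosine pair built by pair_set_epi16 (which repeats the pair across all eight 16-bit lanes), then rounds, shifts, and saturates back to 16 bits via _mm_add_epi32, _mm_srai_epi32 and _mm_packs_epi32. In the FDCT32x32_HIGH_PRECISION path, k_madd_epi32 above stands in for the packed signed 32x32 multiply-add that SSE2 lacks. The scalar sketch below models what a single lane computes, assuming DCT_CONST_BITS is 14 as in vp9_idct.h; the helper names are hypothetical, and the plain casts do not model the saturation that _mm_packs_epi32 performs.

    #include <stdint.h>

    /* Assumed to match vp9_idct.h, where DCT_CONST_BITS is 14. */
    #define SKETCH_DCT_CONST_BITS 14
    #define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

    /* One lane of the 16-bit rotation: _mm_madd_epi16 on interleaved lanes
     * yields a*c1 + b*c2 in 32 bits; adding k__DCT_CONST_ROUNDING and
     * shifting by DCT_CONST_BITS is dct_const_round_shift(); the final
     * cast stands in for the saturating _mm_packs_epi32. */
    static int16_t sketch_butterfly_lane(int16_t a, int16_t b,
                                         int16_t c1, int16_t c2) {
      int32_t sum = (int32_t)a * c1 + (int32_t)b * c2;
      return (int16_t)((sum + SKETCH_DCT_CONST_ROUNDING) >>
                       SKETCH_DCT_CONST_BITS);
    }

    /* One 64-bit lane of k_madd_epi32() (lane is 0 or 1): _mm_mul_epu32
     * multiplies the even dwords, _mm_srli_epi64(x, 32) drops the odd
     * dwords into even position for the second multiply, and
     * _mm_add_epi64 sums the two products. The intrinsic multiply is
     * unsigned, which is why this model takes uint32_t lanes. */
    static uint64_t sketch_k_madd_epi32(const uint32_t a[4],
                                        const uint32_t b[4], int lane) {
      uint64_t even = (uint64_t)a[2 * lane] * b[2 * lane];
      uint64_t odd = (uint64_t)a[2 * lane + 1] * b[2 * lane + 1];
      return even + odd;
    }

k_packs_epi64 then keeps only the low dword of each 64-bit sum, and the k_cvtlo/k_cvthi helpers widen 16-bit lanes to 32 bits by pairing _mm_unpacklo_epi16/_mm_unpackhi_epi16 against zero with a sign mask derived from _mm_cmplt_epi16.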
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in the second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic makes the code slightly slower. + if (0 == pass) { + int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop.
That would help the + // instruction cache but is actually slower. + { + int16_t *ina = in + 0 * str1; + int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[ 0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 4 * str1; + int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[ 4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 8 * str1; + int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[ 8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 
0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 12 * str1; + int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
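Both stage-1 branches above compute the same mirrored butterfly: element i is paired with element 31 - i, with the sums filling step1[0..15] and the differences filling step1[31..16]. Pass 0 additionally pre-scales the input by << 2 (the _mm_slli_epi16 calls), while pass 1 re-reads the transposed intermediate buffer at a fixed stride of 32 with no scaling. A minimal scalar sketch of one column, with hypothetical names; `stride` is in int16_t units (str1 in pass 0, 32 in pass 1):

    #include <stdint.h>

    /* Scalar model of Stage 1 for one column of the 32x32 forward DCT. */
    static void sketch_stage1_column(const int16_t *in, int stride,
                                     int16_t step1[32], int pass) {
      int i;
      for (i = 0; i < 16; ++i) {
        int sum = in[i * stride] + in[(31 - i) * stride];
        int diff = in[i * stride] - in[(31 - i) * stride];
        if (pass == 0) {  /* mirrors the _mm_slli_epi16(x, 2) pre-scaling */
          sum <<= 2;
          diff <<= 2;
        }
        step1[i] = (int16_t)sum;       /* sums feed the even half */
        step1[31 - i] = (int16_t)diff; /* differences feed the odd half */
      }
    }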
+ { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[ 0] = _mm_add_epi16(in00, in31); + step1[ 1] = _mm_add_epi16(in01, in30); + step1[ 2] = _mm_add_epi16(in02, in29); + step1[ 3] = _mm_add_epi16(in03, in28); + step1[28] = _mm_sub_epi16(in03, in28); + step1[29] = _mm_sub_epi16(in02, in29); + step1[30] = _mm_sub_epi16(in01, in30); + step1[31] = _mm_sub_epi16(in00, in31); + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[ 4] = _mm_add_epi16(in04, in27); + step1[ 5] = _mm_add_epi16(in05, in26); + step1[ 6] = _mm_add_epi16(in06, in25); + step1[ 7] = _mm_add_epi16(in07, in24); + step1[24] = _mm_sub_epi16(in07, in24); + step1[25] = _mm_sub_epi16(in06, in25); + step1[26] = _mm_sub_epi16(in05, in26); + step1[27] = _mm_sub_epi16(in04, in27); + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[ 8] = _mm_add_epi16(in08, in23); + step1[ 9] = _mm_add_epi16(in09, in22); + step1[10] = _mm_add_epi16(in10, in21); + step1[11] = _mm_add_epi16(in11, in20); + step1[20] = _mm_sub_epi16(in11, in20); + step1[21] = _mm_sub_epi16(in10, in21); + step1[22] = _mm_sub_epi16(in09, in22); + step1[23] = _mm_sub_epi16(in08, in23); + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = _mm_add_epi16(in12, in19); + step1[13] = _mm_add_epi16(in13, in18); + step1[14] = _mm_add_epi16(in14, in17); + step1[15] = _mm_add_epi16(in15, in16); + step1[16] = _mm_sub_epi16(in15, in16); + step1[17] = _mm_sub_epi16(in14, in17); + step1[18] = _mm_sub_epi16(in13, in18); + step1[19] = _mm_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[ 0] = _mm_add_epi16(step1[0], step1[15]); + 
step2[ 1] = _mm_add_epi16(step1[1], step1[14]); + step2[ 2] = _mm_add_epi16(step1[2], step1[13]); + step2[ 3] = _mm_add_epi16(step1[3], step1[12]); + step2[ 4] = _mm_add_epi16(step1[4], step1[11]); + step2[ 5] = _mm_add_epi16(step1[5], step1[10]); + step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); + step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); + step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); + step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); + step2[10] = _mm_sub_epi16(step1[5], step1[10]); + step2[11] = _mm_sub_epi16(step1[4], step1[11]); + step2[12] = _mm_sub_epi16(step1[3], step1[12]); + step2[13] = _mm_sub_epi16(step1[2], step1[13]); + step2[14] = _mm_sub_epi16(step1[1], step1[14]); + step2[15] = _mm_sub_epi16(step1[0], step1[15]); + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = 
_mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); + } + +#if !FDCT32x32_HIGH_PRECISION + // scale the magnitude back down so the intermediate values stay within + // the range of 16 bits: (x + 1) >> 2 for x >= 0 and (x + 2) >> 2 for x < 0. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); + + step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0); + step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0); + step2[ 2] =
_mm_sub_epi16(step2[ 2], s3_02_0); + step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0); + step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0); + step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0); + step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0); + step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0); + step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); + step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); + step2[10] = _mm_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm_sub_epi16(step1[31], s3_31_0); + + step2[ 0] = _mm_add_epi16(step2[ 0], kOne); + step2[ 1] = _mm_add_epi16(step2[ 1], kOne); + step2[ 2] = _mm_add_epi16(step2[ 2], kOne); + step2[ 3] = _mm_add_epi16(step2[ 3], kOne); + step2[ 4] = _mm_add_epi16(step2[ 4], kOne); + step2[ 5] = _mm_add_epi16(step2[ 5], kOne); + step2[ 6] = _mm_add_epi16(step2[ 6], kOne); + step2[ 7] = _mm_add_epi16(step2[ 7], kOne); + step2[ 8] = _mm_add_epi16(step2[ 8], kOne); + step2[ 9] = _mm_add_epi16(step2[ 9], kOne); + step2[10] = _mm_add_epi16(step2[10], kOne); + step2[11] = _mm_add_epi16(step2[11], kOne); + step2[12] = _mm_add_epi16(step2[12], kOne); + step2[13] = _mm_add_epi16(step2[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step1[16] = _mm_add_epi16(step1[16], kOne); + step1[17] = _mm_add_epi16(step1[17], kOne); + step1[18] = _mm_add_epi16(step1[18], kOne); + step1[19] = _mm_add_epi16(step1[19], kOne); + step2[20] = _mm_add_epi16(step2[20], kOne); + step2[21] = _mm_add_epi16(step2[21], kOne); + step2[22] = _mm_add_epi16(step2[22], kOne); + step2[23] = _mm_add_epi16(step2[23], kOne); + step2[24] = _mm_add_epi16(step2[24], kOne); + step2[25] = _mm_add_epi16(step2[25], kOne); + step2[26] = _mm_add_epi16(step2[26], kOne); + step2[27] = _mm_add_epi16(step2[27], kOne); + step1[28] = _mm_add_epi16(step1[28], kOne); + step1[29] = _mm_add_epi16(step1[29], kOne); + step1[30] = _mm_add_epi16(step1[30], kOne); + step1[31] = _mm_add_epi16(step1[31], kOne); + + step2[ 0] = _mm_srai_epi16(step2[ 0], 2); + step2[ 1] = _mm_srai_epi16(step2[ 1], 2); + step2[ 2] = _mm_srai_epi16(step2[ 2], 2); + step2[ 3] = _mm_srai_epi16(step2[ 3], 2); + step2[ 4] = _mm_srai_epi16(step2[ 4], 2); + step2[ 5] = _mm_srai_epi16(step2[ 5], 2); + step2[ 6] = _mm_srai_epi16(step2[ 6], 2); + step2[ 7] = _mm_srai_epi16(step2[ 7], 2); + step2[ 8] = _mm_srai_epi16(step2[ 8], 2); + step2[ 9] = _mm_srai_epi16(step2[ 9], 2); + step2[10] = _mm_srai_epi16(step2[10], 2); + step2[11] = _mm_srai_epi16(step2[11], 2); + step2[12] = _mm_srai_epi16(step2[12], 2); + step2[13] = _mm_srai_epi16(step2[13], 2); + 
step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step1[16] = _mm_srai_epi16(step1[16], 2); + step1[17] = _mm_srai_epi16(step1[17], 2); + step1[18] = _mm_srai_epi16(step1[18], 2); + step1[19] = _mm_srai_epi16(step1[19], 2); + step2[20] = _mm_srai_epi16(step2[20], 2); + step2[21] = _mm_srai_epi16(step2[21], 2); + step2[22] = _mm_srai_epi16(step2[22], 2); + step2[23] = _mm_srai_epi16(step2[23], 2); + step2[24] = _mm_srai_epi16(step2[24], 2); + step2[25] = _mm_srai_epi16(step2[25], 2); + step2[26] = _mm_srai_epi16(step2[26], 2); + step2[27] = _mm_srai_epi16(step2[27], 2); + step1[28] = _mm_srai_epi16(step1[28], 2); + step1[29] = _mm_srai_epi16(step1[29], 2); + step1[30] = _mm_srai_epi16(step1[30], 2); + step1[31] = _mm_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm_add_epi16(step2[23], 
step1[16]); + step3[17] = _mm_add_epi16(step2[22], step1[17]); + step3[18] = _mm_add_epi16(step2[21], step1[18]); + step3[19] = _mm_add_epi16(step2[20], step1[19]); + step3[20] = _mm_sub_epi16(step1[19], step2[20]); + step3[21] = _mm_sub_epi16(step1[18], step2[21]); + step3[22] = _mm_sub_epi16(step1[17], step2[22]); + step3[23] = _mm_sub_epi16(step1[16], step2[23]); + step3[24] = _mm_sub_epi16(step1[31], step2[24]); + step3[25] = _mm_sub_epi16(step1[30], step2[25]); + step3[26] = _mm_sub_epi16(step1[29], step2[26]); + step3[27] = _mm_sub_epi16(step1[28], step2[27]); + step3[28] = _mm_add_epi16(step2[27], step1[28]); + step3[29] = _mm_add_epi16(step2[26], step1[29]); + step3[30] = _mm_add_epi16(step2[25], step1[30]); + step3[31] = _mm_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); + step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); + step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); + step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); + step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); + step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); + step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); + step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); + step1[12] = _mm_sub_epi16(step2[15], step3[12]); + step1[13] = _mm_sub_epi16(step2[14], step3[13]); + step1[14] = _mm_add_epi16(step3[13], step2[14]); + step1[15] = _mm_add_epi16(step3[12], step2[15]); + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, 
k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm_add_epi16(step1[5], step3[4]); + step2[5] = _mm_sub_epi16(step3[4], step1[5]); + step2[6] = _mm_sub_epi16(step3[7], step1[6]); + step2[7] = _mm_add_epi16(step1[6], step3[7]); + } + { + const __m128i 
out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, 
k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm_add_epi16(step1[19], step3[16]); + step2[17] = _mm_add_epi16(step1[18], step3[17]); + step2[18] = _mm_sub_epi16(step3[17], step1[18]); + step2[19] = _mm_sub_epi16(step3[16], step1[19]); + step2[20] = _mm_sub_epi16(step3[23], step1[20]); + step2[21] = _mm_sub_epi16(step3[22], step1[21]); + step2[22] = _mm_add_epi16(step1[21], step3[22]); + step2[23] = _mm_add_epi16(step1[20], step3[23]); + step2[24] = _mm_add_epi16(step1[27], step3[24]); + step2[25] = _mm_add_epi16(step1[26], step3[25]); + step2[26] = _mm_sub_epi16(step3[25], step1[26]); + step2[27] = _mm_sub_epi16(step3[24], step1[27]); + step2[28] = _mm_sub_epi16(step3[31], step1[28]); + step2[29] = _mm_sub_epi16(step3[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step3[30]); + step2[31] = _mm_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, 
DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); + } + { + step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); + step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); + step3[10] = _mm_sub_epi16(step1[11], step2[10]); + step3[11] = _mm_add_epi16(step2[10], step1[11]); + step3[12] = _mm_add_epi16(step2[13], step1[12]); + step3[13] = _mm_sub_epi16(step1[12], step2[13]); + step3[14] = _mm_sub_epi16(step1[15], step2[14]); + step3[15] = _mm_add_epi16(step2[14], step1[15]); + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = 
_mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, 
k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm_add_epi16(step3[17], step2[16]); + step1[17] = _mm_sub_epi16(step2[16], step3[17]); + step1[18] = _mm_sub_epi16(step2[19], step3[18]); + step1[19] = _mm_add_epi16(step3[18], step2[19]); + step1[20] = _mm_add_epi16(step3[21], step2[20]); + step1[21] = _mm_sub_epi16(step2[20], step3[21]); + step1[22] = _mm_sub_epi16(step2[23], step3[22]); + step1[23] = _mm_add_epi16(step3[22], step2[23]); + step1[24] = _mm_add_epi16(step3[25], step2[24]); + step1[25] = _mm_sub_epi16(step2[24], step3[25]); + step1[26] = _mm_sub_epi16(step2[27], step3[26]); + step1[27] = _mm_add_epi16(step3[26], step2[27]); + step1[28] = _mm_add_epi16(step3[29], step2[28]); + step1[29] = _mm_sub_epi16(step2[28], step3[29]); + step1[30] = _mm_sub_epi16(step2[31], step3[30]); + step1[31] = 
_mm_add_epi16(step3[30], step2[31]); + } + // Final stage --- output indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, 
DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = 
_mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m128i lstep1[64], lstep2[64], lstep3[64]; + __m128i u[32], v[32], sign[16]; + const __m128i mask16 = _mm_set1_epi32(0x80008000); + const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length prior to addition operations + lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero); + lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero); + lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero); + lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero); + lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero); + lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero); + lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero); + lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero); + lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero); + lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero); + lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero); + lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero); + lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero); + lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero); + lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero); + lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero); + + lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); + lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); + lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); + lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); + lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); + lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); + lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); + lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); + lstep3[ 8] = 
_mm_sub_epi32(lstep2[ 6], lstep2[ 8]); + lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); + lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero); + lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero); + lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero); + lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero); + lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero); + lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero); + lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero); + lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero); + lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero); + lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero); + lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero); + lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero); + lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero); + lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero); + lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero); + lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero); + + lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero); + lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero); + lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero); + lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero); + lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero); + lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero); + 
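The k_cvtlo_epi16()/k_cvthi_epi16() calls in this high-precision path widen the low and high four signed 16-bit lanes of a register to 32 bits before the stage adds; plain SSE2 has no sign-extending move (_mm_cvtepi16_epi32 is SSE4.1), which is why the helpers are handed the 0x80008000 sign mask and kZero. A minimal sketch of an equivalent SSE2 sign extension -- an assumption about what the helpers must compute, not their actual definition in this file:

    #include <emmintrin.h>

    /* widen the low four int16 lanes of a to int32 (SSE2 only) */
    static inline __m128i cvtlo_epi16_sketch(__m128i a) {
      /* 0xFFFF in each half-word lane where a is negative, 0x0000 elsewhere */
      const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), a);
      /* interleave: value in the low half-word, its sign bits in the high */
      return _mm_unpacklo_epi16(a, sign);  /* _mm_unpackhi_epi16 for the high lanes */
    }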
lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero); + lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero); + lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero); + lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero); + lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero); + lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero); + lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero); + lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero); + lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero); + lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero); + + lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length prior to addition operations + lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero); + lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero); + lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero); + lstep2[19] = k_cvthi_epi16(step2[ 9], mask16, kZero); + lstep2[28] = k_cvtlo_epi16(step2[14], mask16, kZero); + lstep2[29] = k_cvthi_epi16(step2[14], mask16, kZero); + lstep2[30] = k_cvtlo_epi16(step2[15], mask16, kZero); + lstep2[31] = k_cvthi_epi16(step2[15], mask16, kZero); + + lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); + lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); + lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); + lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); + lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); + lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); + lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); + lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); + lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = 
_mm_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[ 0] = k_madd_epi32(u[0], k32_p16_m16); + v[ 1] = k_madd_epi32(u[1], k32_p16_m16); + v[ 2] = k_madd_epi32(u[2], k32_p16_m16); + v[ 3] = k_madd_epi32(u[3], k32_p16_m16); + v[ 4] = k_madd_epi32(u[0], k32_p16_p16); + v[ 5] = k_madd_epi32(u[1], k32_p16_p16); + v[ 6] = k_madd_epi32(u[2], k32_p16_p16); + v[ 7] = k_madd_epi32(u[3], k32_p16_p16); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); + v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); + v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); + v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); + v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); + v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); + v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); + v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); + v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); + v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); + v[10] = k_madd_epi32(u[10], 
k32_m24_m08); + v[11] = k_madd_epi32(u[11], k32_m24_m08); + v[12] = k_madd_epi32(u[12], k32_m24_m08); + v[13] = k_madd_epi32(u[13], k32_m24_m08); + v[14] = k_madd_epi32(u[14], k32_m24_m08); + v[15] = k_madd_epi32(u[15], k32_m24_m08); + v[16] = k_madd_epi32(u[12], k32_m08_p24); + v[17] = k_madd_epi32(u[13], k32_m08_p24); + v[18] = k_madd_epi32(u[14], k32_m08_p24); + v[19] = k_madd_epi32(u[15], k32_m08_p24); + v[20] = k_madd_epi32(u[ 8], k32_m08_p24); + v[21] = k_madd_epi32(u[ 9], k32_m08_p24); + v[22] = k_madd_epi32(u[10], k32_m08_p24); + v[23] = k_madd_epi32(u[11], k32_m08_p24); + v[24] = k_madd_epi32(u[ 4], k32_p24_p08); + v[25] = k_madd_epi32(u[ 5], k32_p24_p08); + v[26] = k_madd_epi32(u[ 6], k32_p24_p08); + v[27] = k_madd_epi32(u[ 7], k32_p24_p08); + v[28] = k_madd_epi32(u[ 0], k32_p24_p08); + v[29] = k_madd_epi32(u[ 1], k32_p24_p08); + v[30] = k_madd_epi32(u[ 2], k32_p24_p08); + v[31] = k_madd_epi32(u[ 3], k32_p24_p08); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); + lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); + lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]); + 
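Each unpacklo/unpackhi + k_madd_epi32 + k_packs_epi64 group in this 32-bit path is one vectorized butterfly rotation: the two inputs are interleaved, multiplied against a pair_set_epi32() cosine pair at full 32x32-bit precision, and the 64-bit products are repacked for the usual dct_const_round_shift. A scalar sketch of what one such rotation (the k32_m08_p24/k32_p24_p08 pair from stage 4 above) computes per lane, assuming libvpx's DCT_CONST_BITS == 14 and cospi_N_64 == round(16384 * cos(N * pi / 64)):

    #include <stdint.h>

    #define DCT_CONST_BITS     14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* hypothetical scalar reference: rotate (x, y) by the angle encoded in
       (c0, c1) = (cospi_8_64, cospi_24_64), with dct_const_round_shift */
    static void butterfly_sketch(int32_t x, int32_t y, int32_t c0, int32_t c1,
                                 int32_t *out0, int32_t *out1) {
      /* add half of the scale factor, then shift the scale back out */
      *out0 = (int32_t)(((int64_t)x * -c0 + (int64_t)y * c1 +
                         DCT_CONST_ROUNDING) >> DCT_CONST_BITS);  /* k32_m08_p24 */
      *out1 = (int32_t)(((int64_t)x * c1 + (int64_t)y * c0 +
                         DCT_CONST_ROUNDING) >> DCT_CONST_BITS);  /* k32_p24_p08 */
    }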
lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[ 0] = k_madd_epi32(u[0], k32_p16_p16); + v[ 1] = k_madd_epi32(u[1], k32_p16_p16); + v[ 2] = k_madd_epi32(u[2], k32_p16_p16); + v[ 3] = k_madd_epi32(u[3], k32_p16_p16); + v[ 4] = k_madd_epi32(u[0], k32_p16_m16); + v[ 5] = k_madd_epi32(u[1], k32_p16_m16); + v[ 6] = k_madd_epi32(u[2], k32_p16_m16); + v[ 7] = k_madd_epi32(u[3], k32_p16_m16); + v[ 8] = k_madd_epi32(u[4], k32_p24_p08); + v[ 9] = k_madd_epi32(u[5], k32_p24_p08); + v[10] = k_madd_epi32(u[6], k32_p24_p08); + v[11] = k_madd_epi32(u[7], k32_p24_p08); + v[12] = k_madd_epi32(u[4], k32_m08_p24); + v[13] = k_madd_epi32(u[5], k32_m08_p24); + v[14] = k_madd_epi32(u[6], k32_m08_p24); + v[15] = k_madd_epi32(u[7], k32_m08_p24); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = 
_mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + // Combine + out[ 0] = _mm_packs_epi32(u[0], u[1]); + out[16] = _mm_packs_epi32(u[2], u[3]); + out[ 8] = _mm_packs_epi32(u[4], u[5]); + out[24] = _mm_packs_epi32(u[6], u[7]); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m24_m08); + v[5] = k_madd_epi32(u[5], k32_m24_m08); + v[6] = k_madd_epi32(u[6], k32_m24_m08); + v[7] = k_madd_epi32(u[7], k32_m24_m08); + v[ 8] = k_madd_epi32(u[4], k32_m08_p24); + v[ 9] = k_madd_epi32(u[5], k32_m08_p24); + v[10] = k_madd_epi32(u[6], k32_m08_p24); + v[11] = k_madd_epi32(u[7], k32_m08_p24); + v[12] = k_madd_epi32(u[0], k32_p24_p08); + v[13] = k_madd_epi32(u[1], k32_p24_p08); + v[14] = k_madd_epi32(u[2], k32_p24_p08); + v[15] = k_madd_epi32(u[3], k32_p24_p08); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = 
_mm_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); + u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); + + v[0] = k_madd_epi32(u[0], k32_p28_p04); + v[1] = k_madd_epi32(u[1], k32_p28_p04); + v[2] = k_madd_epi32(u[2], k32_p28_p04); + v[3] = k_madd_epi32(u[3], k32_p28_p04); + v[4] = k_madd_epi32(u[4], k32_p12_p20); + v[5] = k_madd_epi32(u[5], k32_p12_p20); + v[6] = k_madd_epi32(u[6], k32_p12_p20); + v[7] = k_madd_epi32(u[7], k32_p12_p20); + v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); + v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m04_p28); + v[13] = k_madd_epi32(u[13], k32_m04_p28); + v[14] = k_madd_epi32(u[14], k32_m04_p28); + v[15] = k_madd_epi32(u[15], k32_m04_p28); + + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = 
_mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + out[ 4] = _mm_packs_epi32(u[0], u[1]); + out[20] = _mm_packs_epi32(u[2], u[3]); + out[12] = _mm_packs_epi32(u[4], u[5]); + out[28] = _mm_packs_epi32(u[6], u[7]); + } + { + lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64, + -cospi_20_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + + u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[ 1] = _mm_unpackhi_epi32(lstep2[34], 
lstep2[60]); + u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28); + v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28); + v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28); + v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28); + v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04); + v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04); + v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04); + v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04); + v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); + v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m12_m20); + v[13] = k_madd_epi32(u[13], k32_m12_m20); + v[14] = k_madd_epi32(u[14], k32_m12_m20); + v[15] = k_madd_epi32(u[15], k32_m12_m20); + v[16] = k_madd_epi32(u[12], k32_m20_p12); + v[17] = k_madd_epi32(u[13], k32_m20_p12); + v[18] = k_madd_epi32(u[14], k32_m20_p12); + v[19] = k_madd_epi32(u[15], k32_m20_p12); + v[20] = k_madd_epi32(u[ 8], k32_p12_p20); + v[21] = k_madd_epi32(u[ 9], k32_p12_p20); + v[22] = k_madd_epi32(u[10], k32_p12_p20); + v[23] = k_madd_epi32(u[11], k32_p12_p20); + v[24] = k_madd_epi32(u[ 4], k32_m04_p28); + v[25] = k_madd_epi32(u[ 5], k32_m04_p28); + v[26] = k_madd_epi32(u[ 6], k32_m04_p28); + v[27] = k_madd_epi32(u[ 7], k32_m04_p28); + v[28] = k_madd_epi32(u[ 0], k32_p28_p04); + v[29] = k_madd_epi32(u[ 1], k32_p28_p04); + v[30] = k_madd_epi32(u[ 2], k32_p28_p04); + v[31] = k_madd_epi32(u[ 3], k32_p28_p04); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], 
k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); + const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); + const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); + const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); + const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); + const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); + + u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02); + v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02); + v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02); + v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02); + v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18); + v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18); + v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18); + v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18); + v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10); + v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10); + v[10] = k_madd_epi32(u[10], k32_p22_p10); + v[11] = k_madd_epi32(u[11], k32_p22_p10); + v[12] = k_madd_epi32(u[12], k32_p06_p26); + v[13] = k_madd_epi32(u[13], k32_p06_p26); + v[14] = k_madd_epi32(u[14], k32_p06_p26); + v[15] = k_madd_epi32(u[15], k32_p06_p26); + v[16] = k_madd_epi32(u[12], k32_m26_p06); + v[17] = k_madd_epi32(u[13], k32_m26_p06); + v[18] = k_madd_epi32(u[14], k32_m26_p06); + v[19] = k_madd_epi32(u[15], k32_m26_p06); + v[20] = k_madd_epi32(u[ 8], k32_m10_p22); + v[21] = k_madd_epi32(u[ 9], k32_m10_p22); + v[22] = k_madd_epi32(u[10], k32_m10_p22); + v[23] = k_madd_epi32(u[11], k32_m10_p22); + v[24] = k_madd_epi32(u[ 4], 
k32_m18_p14); + v[25] = k_madd_epi32(u[ 5], k32_m18_p14); + v[26] = k_madd_epi32(u[ 6], k32_m18_p14); + v[27] = k_madd_epi32(u[ 7], k32_m18_p14); + v[28] = k_madd_epi32(u[ 0], k32_m02_p30); + v[29] = k_madd_epi32(u[ 1], k32_m02_p30); + v[30] = k_madd_epi32(u[ 2], k32_m02_p30); + v[31] = k_madd_epi32(u[ 3], k32_m02_p30); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); + u[ 
7] = _mm_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[ 0] = _mm_add_epi32(u[ 0], K32One); + v[ 1] = _mm_add_epi32(u[ 1], K32One); + v[ 2] = _mm_add_epi32(u[ 2], K32One); + v[ 3] = _mm_add_epi32(u[ 3], K32One); + v[ 4] = _mm_add_epi32(u[ 4], K32One); + v[ 5] = _mm_add_epi32(u[ 5], K32One); + v[ 6] = _mm_add_epi32(u[ 6], K32One); + v[ 7] = _mm_add_epi32(u[ 7], K32One); + v[ 8] = _mm_add_epi32(u[ 8], K32One); + v[ 9] = _mm_add_epi32(u[ 9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[ 0] = _mm_srai_epi32(v[ 0], 2); + u[ 1] = _mm_srai_epi32(v[ 1], 2); + u[ 2] = _mm_srai_epi32(v[ 2], 2); + u[ 3] = _mm_srai_epi32(v[ 3], 2); + u[ 4] = _mm_srai_epi32(v[ 4], 2); + u[ 5] = _mm_srai_epi32(v[ 5], 2); + u[ 6] = _mm_srai_epi32(v[ 6], 2); + u[ 7] = _mm_srai_epi32(v[ 7], 2); + u[ 8] = _mm_srai_epi32(v[ 8], 2); + u[ 9] = _mm_srai_epi32(v[ 9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[ 2] = _mm_packs_epi32(u[0], u[1]); + out[18] = _mm_packs_epi32(u[2], u[3]); + out[10] = _mm_packs_epi32(u[4], u[5]); + out[26] = _mm_packs_epi32(u[6], u[7]); + out[ 6] = _mm_packs_epi32(u[8], u[9]); + out[22] = _mm_packs_epi32(u[10], u[11]); + out[14] = _mm_packs_epi32(u[12], u[13]); + out[30] = _mm_packs_epi32(u[14], u[15]); + } + { + lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = 
_mm_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); + const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); + const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); + const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); + const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); + const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); + const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); + const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); + + u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01); + v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01); + v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01); + v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01); + v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17); + v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17); + v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17); + v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17); + v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09); + v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09); + v[10] = k_madd_epi32(u[10], k32_p23_p09); + v[11] = k_madd_epi32(u[11], k32_p23_p09); + v[12] = k_madd_epi32(u[12], k32_p07_p25); + v[13] = k_madd_epi32(u[13], k32_p07_p25); + v[14] = k_madd_epi32(u[14], k32_p07_p25); + v[15] = k_madd_epi32(u[15], k32_p07_p25); + v[16] = k_madd_epi32(u[12], k32_m25_p07); + v[17] = k_madd_epi32(u[13], k32_m25_p07); + v[18] = k_madd_epi32(u[14], k32_m25_p07); + v[19] = k_madd_epi32(u[15], k32_m25_p07); + v[20] = k_madd_epi32(u[ 8], k32_m09_p23); + v[21] = k_madd_epi32(u[ 9], k32_m09_p23); + v[22] = k_madd_epi32(u[10], k32_m09_p23); + v[23] = k_madd_epi32(u[11], k32_m09_p23); + v[24] = k_madd_epi32(u[ 4], k32_m17_p15); + v[25] = k_madd_epi32(u[ 5], k32_m17_p15); + v[26] = k_madd_epi32(u[ 6], k32_m17_p15); + v[27] = k_madd_epi32(u[ 7], k32_m17_p15); + v[28] = k_madd_epi32(u[ 0], k32_m01_p31); + v[29] = k_madd_epi32(u[ 1], k32_m01_p31); + v[30] = k_madd_epi32(u[ 2], k32_m01_p31); + v[31] = k_madd_epi32(u[ 3], k32_m01_p31); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + 
u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = 
_mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[ 1] = _mm_packs_epi32(u[0], u[1]); + out[17] = _mm_packs_epi32(u[2], u[3]); + out[ 9] = _mm_packs_epi32(u[4], u[5]); + out[25] = _mm_packs_epi32(u[6], u[7]); + out[ 7] = _mm_packs_epi32(u[8], u[9]); + out[23] = _mm_packs_epi32(u[10], u[11]); + out[15] = _mm_packs_epi32(u[12], u[13]); + out[31] = _mm_packs_epi32(u[14], u[15]); + } + { + const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); + const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); + const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); + const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); + const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); + const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); + const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); + const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); + + u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05); + v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05); + v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05); + v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05); + v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21); + v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21); + v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21); + v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21); + v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13); + v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13); + v[10] = k_madd_epi32(u[10], k32_p19_p13); + v[11] = k_madd_epi32(u[11], k32_p19_p13); + v[12] = k_madd_epi32(u[12], k32_p03_p29); + v[13] = k_madd_epi32(u[13], k32_p03_p29); + v[14] = k_madd_epi32(u[14], k32_p03_p29); + v[15] = k_madd_epi32(u[15], k32_p03_p29); + v[16] = k_madd_epi32(u[12], k32_m29_p03); + v[17] = k_madd_epi32(u[13], k32_m29_p03); + v[18] = k_madd_epi32(u[14], k32_m29_p03); + v[19] = k_madd_epi32(u[15], k32_m29_p03); + v[20] = k_madd_epi32(u[ 8], k32_m13_p19); + v[21] = k_madd_epi32(u[ 9], k32_m13_p19); + v[22] = k_madd_epi32(u[10], k32_m13_p19); + v[23] = k_madd_epi32(u[11], k32_m13_p19); + v[24] = k_madd_epi32(u[ 4], k32_m21_p11); + v[25] = k_madd_epi32(u[ 5], 
k32_m21_p11); + v[26] = k_madd_epi32(u[ 6], k32_m21_p11); + v[27] = k_madd_epi32(u[ 7], k32_m21_p11); + v[28] = k_madd_epi32(u[ 0], k32_m05_p27); + v[29] = k_madd_epi32(u[ 1], k32_m05_p27); + v[30] = k_madd_epi32(u[ 2], k32_m05_p27); + v[31] = k_madd_epi32(u[ 3], k32_m05_p27); + + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64(v[10], v[11]); + u[ 6] = k_packs_epi64(v[12], v[13]); + u[ 7] = k_packs_epi64(v[14], v[15]); + u[ 8] = k_packs_epi64(v[16], v[17]); + u[ 9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); + u[ 8] = 
_mm_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[ 5] = _mm_packs_epi32(u[0], u[1]); + out[21] = _mm_packs_epi32(u[2], u[3]); + out[13] = _mm_packs_epi32(u[4], u[5]); + out[29] = _mm_packs_epi32(u[6], u[7]); + out[ 3] = _mm_packs_epi32(u[8], u[9]); + out[19] = _mm_packs_epi32(u[10], u[11]); + out[11] = _mm_packs_epi32(u[12], u[13]); + out[27] = _mm_packs_epi32(u[14], u[15]); + } + } +#endif + // Transpose the results, do it as four 8x8 transposes. 
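The four 8x8 transposes below use the standard three-level unpack ladder: interleave 16-bit lanes of adjacent rows, then 32-bit pairs, then 64-bit halves. As a self-contained reference (illustrative only, not part of the patch), the same ladder applied to one 8x8 tile of int16 coefficients:

  #include <emmintrin.h>

  // Transpose an 8x8 tile of int16 values held in eight SSE2 registers.
  // On return, lane j of rows[i] holds what was lane i of rows[j].
  static void transpose_8x8_epi16(__m128i rows[8]) {
    // Level 1: interleave 16-bit lanes of adjacent rows.
    const __m128i a0 = _mm_unpacklo_epi16(rows[0], rows[1]);
    const __m128i a1 = _mm_unpacklo_epi16(rows[2], rows[3]);
    const __m128i a2 = _mm_unpackhi_epi16(rows[0], rows[1]);
    const __m128i a3 = _mm_unpackhi_epi16(rows[2], rows[3]);
    const __m128i a4 = _mm_unpacklo_epi16(rows[4], rows[5]);
    const __m128i a5 = _mm_unpacklo_epi16(rows[6], rows[7]);
    const __m128i a6 = _mm_unpackhi_epi16(rows[4], rows[5]);
    const __m128i a7 = _mm_unpackhi_epi16(rows[6], rows[7]);
    // Level 2: interleave 32-bit pairs.
    const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
    const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
    const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
    const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
    const __m128i b5 = _mm_unpacklo_epi32(a6, a7);
    const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
    const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
    // Level 3: interleave 64-bit halves to finish the transpose.
    rows[0] = _mm_unpacklo_epi64(b0, b4);
    rows[1] = _mm_unpackhi_epi64(b0, b4);
    rows[2] = _mm_unpacklo_epi64(b2, b6);
    rows[3] = _mm_unpackhi_epi64(b2, b6);
    rows[4] = _mm_unpacklo_epi64(b1, b5);
    rows[5] = _mm_unpackhi_epi64(b1, b5);
    rows[6] = _mm_unpacklo_epi64(b3, b7);
    rows[7] = _mm_unpackhi_epi64(b3, b7);
  }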
+ { + int transpose_block; + int16_t *output; + if (0 == pass) { + output = &intermediate[column_start * 32]; + } else { + output = &output_org[column_start * 32]; + } + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... 
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in vp9/encoder/vp9_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic makes the code slightly slower. + _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); + // Process next 8x8 + output += 8; + } + } + } +} diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index bf09c7a..eb271fe 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -2572,1224 +2572,14 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, write_buffer_16x16(output, in0, in1, 16); } -void vp9_short_fdct32x32_rd_sse2(int16_t *input, - int16_t *output_org, int pitch) { - // Calculate pre-multiplied strides - const int str1 = pitch >> 1; - const int str2 = pitch; - const int str3 = pitch + str1; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
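The pair-constant trick works because _mm_madd_epi16 multiplies adjacent 16-bit lanes and sums each product pair into one 32-bit lane: with the data interleaved as (x0, y0, x1, y1, ...) and the constant replicated as (c0, c1, c0, c1, ...), every 32-bit lane yields x*c0 + y*c1 in a single instruction, which is then rounded and shifted back to 16 bits. A rough sketch of that recurring unpack/madd/round/shift/pack cluster; the my_-prefixed names are illustrative stand-ins for helpers and constants defined elsewhere in libvpx and may differ in detail:

  #include <emmintrin.h>
  #include <stdint.h>

  #define MY_DCT_CONST_BITS 14  // assumed fixed-point precision
  #define MY_DCT_CONST_ROUNDING (1 << (MY_DCT_CONST_BITS - 1))

  // Replicate the 16-bit pair (a, b) across the register: a b a b a b a b.
  static __m128i my_pair_set_epi16(int16_t a, int16_t b) {
    return _mm_set_epi16(b, a, b, a, b, a, b, a);
  }

  // One butterfly: round_shift(x*a + y*b) for eight lanes, packed to int16.
  static __m128i my_butterfly(__m128i x, __m128i y, int16_t a, int16_t b) {
    const __m128i k = my_pair_set_epi16(a, b);
    const __m128i rounding = _mm_set1_epi32(MY_DCT_CONST_ROUNDING);
    const __m128i lo = _mm_unpacklo_epi16(x, y);   // x0 y0 x1 y1 ...
    const __m128i hi = _mm_unpackhi_epi16(x, y);   // x4 y4 x5 y5 ...
    const __m128i plo = _mm_madd_epi16(lo, k);     // x*a + y*b, 32-bit lanes
    const __m128i phi = _mm_madd_epi16(hi, k);
    const __m128i rlo =
        _mm_srai_epi32(_mm_add_epi32(plo, rounding), MY_DCT_CONST_BITS);
    const __m128i rhi =
        _mm_srai_epi32(_mm_add_epi32(phi, rounding), MY_DCT_CONST_BITS);
    return _mm_packs_epi32(rlo, rhi);              // saturating pack to int16
  }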
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - int pass; - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 32; column_start += 8) { - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - // Note: even though all the loads below are aligned, using the aligned - // intrinsic makes the code slightly slower. - if (0 == pass) { - int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. 
That would help the - // instruction cache but is actually slower. - { - int16_t *ina = in + 0 * str1; - int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[ 0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - int16_t *ina = in + 4 * str1; - int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[ 4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - int16_t *ina = in + 8 * str1; - int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[ 8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 
0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - int16_t *ina = in + 12 * str1; - int16_t *inb = in + 19 * str1; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[ 0] = _mm_add_epi16(ina0, inb0); - step1a[ 1] = _mm_add_epi16(ina1, inb1); - step1a[ 2] = _mm_add_epi16(ina2, inb2); - step1a[ 3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); - step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); - step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); - step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. 
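As the notes say, the four mirrored load blocks are an unrolled loop. Rolled up, the second pass would look roughly like the sketch below (the first pass would additionally shift each sum and difference left by 2). A sketch under the assumption that in points at the current eight columns of the 32x32 intermediate buffer; illustrative only, not part of the patch:

  #include <emmintrin.h>
  #include <stdint.h>

  // Stage 1, second pass: step1[i] = in[i*32] + in[(31-i)*32] and
  // step1[31-i] = in[i*32] - in[(31-i)*32], mirrored around the middle.
  static void stage1_pass1_rolled(const int16_t *in, __m128i step1[32]) {
    int i;
    for (i = 0; i < 16; ++i) {
      const __m128i top =
          _mm_loadu_si128((const __m128i *)(in + i * 32));
      const __m128i bot =
          _mm_loadu_si128((const __m128i *)(in + (31 - i) * 32));
      step1[i]      = _mm_add_epi16(top, bot);  // mirrored sum
      step1[31 - i] = _mm_sub_epi16(top, bot);  // mirrored difference
    }
  }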
- { - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); - step1[ 0] = _mm_add_epi16(in00, in31); - step1[ 1] = _mm_add_epi16(in01, in30); - step1[ 2] = _mm_add_epi16(in02, in29); - step1[ 3] = _mm_add_epi16(in03, in28); - step1[28] = _mm_sub_epi16(in03, in28); - step1[29] = _mm_sub_epi16(in02, in29); - step1[30] = _mm_sub_epi16(in01, in30); - step1[31] = _mm_sub_epi16(in00, in31); - } - { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); - step1[ 4] = _mm_add_epi16(in04, in27); - step1[ 5] = _mm_add_epi16(in05, in26); - step1[ 6] = _mm_add_epi16(in06, in25); - step1[ 7] = _mm_add_epi16(in07, in24); - step1[24] = _mm_sub_epi16(in07, in24); - step1[25] = _mm_sub_epi16(in06, in25); - step1[26] = _mm_sub_epi16(in05, in26); - step1[27] = _mm_sub_epi16(in04, in27); - } - { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); - step1[ 8] = _mm_add_epi16(in08, in23); - step1[ 9] = _mm_add_epi16(in09, in22); - step1[10] = _mm_add_epi16(in10, in21); - step1[11] = _mm_add_epi16(in11, in20); - step1[20] = _mm_sub_epi16(in11, in20); - step1[21] = _mm_sub_epi16(in10, in21); - step1[22] = _mm_sub_epi16(in09, in22); - step1[23] = _mm_sub_epi16(in08, in23); - } - { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); - step1[12] = _mm_add_epi16(in12, in19); - step1[13] = _mm_add_epi16(in13, in18); - step1[14] = _mm_add_epi16(in14, in17); - step1[15] = _mm_add_epi16(in15, in16); - step1[16] = _mm_sub_epi16(in15, in16); - step1[17] = _mm_sub_epi16(in14, in17); - step1[18] = _mm_sub_epi16(in13, in18); - step1[19] = _mm_sub_epi16(in12, in19); - } - } - // Stage 2 - { - step2[ 0] = _mm_add_epi16(step1[0], step1[15]); - 
step2[ 1] = _mm_add_epi16(step1[1], step1[14]); - step2[ 2] = _mm_add_epi16(step1[2], step1[13]); - step2[ 3] = _mm_add_epi16(step1[3], step1[12]); - step2[ 4] = _mm_add_epi16(step1[4], step1[11]); - step2[ 5] = _mm_add_epi16(step1[5], step1[10]); - step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); - step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); - step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); - step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); - step2[10] = _mm_sub_epi16(step1[5], step1[10]); - step2[11] = _mm_sub_epi16(step1[4], step1[11]); - step2[12] = _mm_sub_epi16(step1[3], step1[12]); - step2[13] = _mm_sub_epi16(step1[2], step1[13]); - step2[14] = _mm_sub_epi16(step1[1], step1[14]); - step2[15] = _mm_sub_epi16(step1[0], step1[15]); - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = 
_mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); - } - // Stage 3 - { - step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm_add_epi16(step2[23], step1[16]); - step3[17] = _mm_add_epi16(step2[22], step1[17]); - step3[18] = _mm_add_epi16(step2[21], step1[18]); - step3[19] = _mm_add_epi16(step2[20], step1[19]); - step3[20] = _mm_sub_epi16(step1[19], step2[20]); - step3[21] = _mm_sub_epi16(step1[18], step2[21]); - step3[22] = _mm_sub_epi16(step1[17], step2[22]); - step3[23] = _mm_sub_epi16(step1[16], step2[23]); - step3[24] = _mm_sub_epi16(step1[31], step2[24]); - step3[25] = _mm_sub_epi16(step1[30], step2[25]); - step3[26] = _mm_sub_epi16(step1[29], step2[26]); - step3[27] = _mm_sub_epi16(step1[28], step2[27]); - step3[28] = _mm_add_epi16(step2[27], step1[28]); - step3[29] = _mm_add_epi16(step2[26], step1[29]); - step3[30] = _mm_add_epi16(step2[25], step1[30]); - step3[31] = _mm_add_epi16(step2[24], step1[31]); - } - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); - __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero); - __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero); - __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero); - __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero); - __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); - __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); - __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero); - __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero); - __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero); - __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero); - __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero); - __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero); - __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero); - __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero); - __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero); - __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero); - __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero); - __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero); - __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero); - __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero); - __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero); - __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero); - step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0); - step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0); - step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0); - step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0); - step3[ 4] = _mm_sub_epi16(step3[ 4], 
s3_04_0); - step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0); - step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0); - step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0); - step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); - step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); - step3[10] = _mm_sub_epi16(step3[10], s3_10_0); - step3[11] = _mm_sub_epi16(step3[11], s3_11_0); - step3[12] = _mm_sub_epi16(step3[12], s3_12_0); - step3[13] = _mm_sub_epi16(step3[13], s3_13_0); - step2[14] = _mm_sub_epi16(step2[14], s2_14_0); - step2[15] = _mm_sub_epi16(step2[15], s2_15_0); - step3[16] = _mm_sub_epi16(step3[16], s3_16_0); - step3[17] = _mm_sub_epi16(step3[17], s3_17_0); - step3[18] = _mm_sub_epi16(step3[18], s3_18_0); - step3[19] = _mm_sub_epi16(step3[19], s3_19_0); - step3[20] = _mm_sub_epi16(step3[20], s3_20_0); - step3[21] = _mm_sub_epi16(step3[21], s3_21_0); - step3[22] = _mm_sub_epi16(step3[22], s3_22_0); - step3[23] = _mm_sub_epi16(step3[23], s3_23_0); - step3[24] = _mm_sub_epi16(step3[24], s3_24_0); - step3[25] = _mm_sub_epi16(step3[25], s3_25_0); - step3[26] = _mm_sub_epi16(step3[26], s3_26_0); - step3[27] = _mm_sub_epi16(step3[27], s3_27_0); - step3[28] = _mm_sub_epi16(step3[28], s3_28_0); - step3[29] = _mm_sub_epi16(step3[29], s3_29_0); - step3[30] = _mm_sub_epi16(step3[30], s3_30_0); - step3[31] = _mm_sub_epi16(step3[31], s3_31_0); - step3[ 0] = _mm_add_epi16(step3[ 0], kOne); - step3[ 1] = _mm_add_epi16(step3[ 1], kOne); - step3[ 2] = _mm_add_epi16(step3[ 2], kOne); - step3[ 3] = _mm_add_epi16(step3[ 3], kOne); - step3[ 4] = _mm_add_epi16(step3[ 4], kOne); - step3[ 5] = _mm_add_epi16(step3[ 5], kOne); - step3[ 6] = _mm_add_epi16(step3[ 6], kOne); - step3[ 7] = _mm_add_epi16(step3[ 7], kOne); - step2[ 8] = _mm_add_epi16(step2[ 8], kOne); - step2[ 9] = _mm_add_epi16(step2[ 9], kOne); - step3[10] = _mm_add_epi16(step3[10], kOne); - step3[11] = _mm_add_epi16(step3[11], kOne); - step3[12] = _mm_add_epi16(step3[12], kOne); - step3[13] = _mm_add_epi16(step3[13], kOne); - step2[14] = _mm_add_epi16(step2[14], kOne); - step2[15] = _mm_add_epi16(step2[15], kOne); - step3[16] = _mm_add_epi16(step3[16], kOne); - step3[17] = _mm_add_epi16(step3[17], kOne); - step3[18] = _mm_add_epi16(step3[18], kOne); - step3[19] = _mm_add_epi16(step3[19], kOne); - step3[20] = _mm_add_epi16(step3[20], kOne); - step3[21] = _mm_add_epi16(step3[21], kOne); - step3[22] = _mm_add_epi16(step3[22], kOne); - step3[23] = _mm_add_epi16(step3[23], kOne); - step3[24] = _mm_add_epi16(step3[24], kOne); - step3[25] = _mm_add_epi16(step3[25], kOne); - step3[26] = _mm_add_epi16(step3[26], kOne); - step3[27] = _mm_add_epi16(step3[27], kOne); - step3[28] = _mm_add_epi16(step3[28], kOne); - step3[29] = _mm_add_epi16(step3[29], kOne); - step3[30] = _mm_add_epi16(step3[30], kOne); - step3[31] = _mm_add_epi16(step3[31], kOne); - step3[ 0] = _mm_srai_epi16(step3[ 0], 2); - step3[ 1] = _mm_srai_epi16(step3[ 1], 2); - step3[ 2] = _mm_srai_epi16(step3[ 2], 2); - step3[ 3] = _mm_srai_epi16(step3[ 3], 2); - step3[ 4] = _mm_srai_epi16(step3[ 4], 2); - step3[ 5] = _mm_srai_epi16(step3[ 5], 2); - step3[ 6] = _mm_srai_epi16(step3[ 6], 2); - step3[ 7] = _mm_srai_epi16(step3[ 7], 2); - step2[ 8] = _mm_srai_epi16(step2[ 8], 2); - step2[ 9] = _mm_srai_epi16(step2[ 9], 2); - step3[10] = _mm_srai_epi16(step3[10], 2); - step3[11] = _mm_srai_epi16(step3[11], 2); - step3[12] = _mm_srai_epi16(step3[12], 2); - step3[13] = _mm_srai_epi16(step3[13], 2); - step2[14] = _mm_srai_epi16(step2[14], 2); - step2[15] = _mm_srai_epi16(step2[15], 2); - step3[16] = _mm_srai_epi16(step3[16], 2); - 
step3[17] = _mm_srai_epi16(step3[17], 2); - step3[18] = _mm_srai_epi16(step3[18], 2); - step3[19] = _mm_srai_epi16(step3[19], 2); - step3[20] = _mm_srai_epi16(step3[20], 2); - step3[21] = _mm_srai_epi16(step3[21], 2); - step3[22] = _mm_srai_epi16(step3[22], 2); - step3[23] = _mm_srai_epi16(step3[23], 2); - step3[24] = _mm_srai_epi16(step3[24], 2); - step3[25] = _mm_srai_epi16(step3[25], 2); - step3[26] = _mm_srai_epi16(step3[26], 2); - step3[27] = _mm_srai_epi16(step3[27], 2); - step3[28] = _mm_srai_epi16(step3[28], 2); - step3[29] = _mm_srai_epi16(step3[29], 2); - step3[30] = _mm_srai_epi16(step3[30], 2); - step3[31] = _mm_srai_epi16(step3[31], 2); - } - // Stage 4 - { - step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); - step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); - step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); - step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); - step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); - step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); - step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); - step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); - step1[12] = _mm_sub_epi16(step2[15], step3[12]); - step1[13] = _mm_sub_epi16(step2[14], step3[13]); - step1[14] = _mm_add_epi16(step3[13], step2[14]); - step1[15] = _mm_add_epi16(step3[12], step2[15]); - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = 
_mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm_add_epi16(step1[5], step3[4]); - step2[5] = _mm_sub_epi16(step3[4], step1[5]); - step2[6] = _mm_sub_epi16(step3[7], step1[6]); - step2[7] = _mm_add_epi16(step1[6], step3[7]); - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - 
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 
= _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm_add_epi16(step1[19], step3[16]); - step2[17] = _mm_add_epi16(step1[18], step3[17]); - step2[18] = _mm_sub_epi16(step3[17], step1[18]); - step2[19] = _mm_sub_epi16(step3[16], step1[19]); - step2[20] = _mm_sub_epi16(step3[23], step1[20]); - step2[21] = _mm_sub_epi16(step3[22], step1[21]); - step2[22] = _mm_add_epi16(step1[21], step3[22]); - step2[23] = _mm_add_epi16(step1[20], step3[23]); - step2[24] = _mm_add_epi16(step1[27], step3[24]); - step2[25] = _mm_add_epi16(step1[26], step3[25]); - step2[26] = _mm_sub_epi16(step3[25], step1[26]); - step2[27] = _mm_sub_epi16(step3[24], step1[27]); - step2[28] = _mm_sub_epi16(step3[31], step1[28]); - step2[29] = _mm_sub_epi16(step3[30], step1[29]); - step2[30] = _mm_add_epi16(step1[29], step3[30]); - step2[31] = _mm_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = 
_mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); - } - { - step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); - step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); - step3[10] = _mm_sub_epi16(step1[11], step2[10]); - step3[11] = _mm_add_epi16(step2[10], step1[11]); - step3[12] = _mm_add_epi16(step2[13], step1[12]); - step3[13] = _mm_sub_epi16(step1[12], step2[13]); - step3[14] = _mm_sub_epi16(step1[15], step2[14]); - step3[15] = _mm_add_epi16(step2[14], step1[15]); - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const 
__m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const 
__m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm_add_epi16(step3[17], step2[16]); - step1[17] = _mm_sub_epi16(step2[16], step3[17]); - step1[18] = _mm_sub_epi16(step2[19], step3[18]); - step1[19] = _mm_add_epi16(step3[18], step2[19]); - step1[20] = _mm_add_epi16(step3[21], step2[20]); - step1[21] = _mm_sub_epi16(step2[20], step3[21]); - step1[22] = _mm_sub_epi16(step2[23], step3[22]); - step1[23] = _mm_add_epi16(step3[22], step2[23]); - step1[24] = _mm_add_epi16(step3[25], step2[24]); - step1[25] = _mm_sub_epi16(step2[24], step3[25]); - step1[26] = _mm_sub_epi16(step2[27], step3[26]); - step1[27] = _mm_add_epi16(step3[26], step2[27]); - step1[28] = _mm_add_epi16(step3[29], step2[28]); - step1[29] = _mm_sub_epi16(step2[28], step3[29]); - step1[30] = _mm_sub_epi16(step2[31], step3[30]); - step1[31] = _mm_add_epi16(step3[30], step2[31]); - } - // Final stage --- output indices are bit-reversed.
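Every butterfly in these stages follows the same fixed-point idiom: unpack two 16-bit vectors into interleaved pairs, multiply-accumulate against packed Q14 cosine constants with _mm_madd_epi16, add k__DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS, and pack back to 16 bits. A scalar sketch of that dct_const_round_shift step, assuming the DCT_CONST_BITS value of 14 from vp9/common/vp9_idct.h (the helper name here is illustrative, not libvpx's):

#include <stdint.h>

#define DCT_CONST_BITS 14  /* Q14 cosine constants, per vp9/common/vp9_idct.h */

/* Per 32-bit lane, the k__DCT_CONST_ROUNDING add plus _mm_srai_epi32 compute: */
static int16_t dct_const_round_shift_sketch(int32_t input) {
  return (int16_t)((input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

So, for example, the out[8]/out[24] pair earlier in this function is, per coefficient, out8 = dct_const_round_shift(step1_2 * cospi_24_64 + step1_3 * cospi_8_64) and out24 = dct_const_round_shift(step1_3 * cospi_24_64 - step1_2 * cospi_8_64).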
- { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const 
__m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = _mm_add_epi32(out_11_2, 
k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); - } - // Transpose the results, do it as four 8x8 transposes. 
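Each of the four 8x8 transposes below is three rounds of interleaves, widening from 16-bit to 32-bit to 64-bit granularity, with no other shuffles. A self-contained sketch of the same idiom (the function name is illustrative, not from libvpx):

#include <emmintrin.h>

/* Transpose eight rows of eight int16_t values held in r[0..7], in place. */
static void transpose_8x8_epi16_sketch(__m128i r[8]) {
  /* Pass 1: 16-bit interleave, e.g. a0 = 00 10 01 11 02 12 03 13. */
  const __m128i a0 = _mm_unpacklo_epi16(r[0], r[1]);
  const __m128i a1 = _mm_unpacklo_epi16(r[2], r[3]);
  const __m128i a2 = _mm_unpackhi_epi16(r[0], r[1]);
  const __m128i a3 = _mm_unpackhi_epi16(r[2], r[3]);
  const __m128i a4 = _mm_unpacklo_epi16(r[4], r[5]);
  const __m128i a5 = _mm_unpacklo_epi16(r[6], r[7]);
  const __m128i a6 = _mm_unpackhi_epi16(r[4], r[5]);
  const __m128i a7 = _mm_unpackhi_epi16(r[6], r[7]);
  /* Pass 2: 32-bit interleave, e.g. b0 = 00 10 20 30 01 11 21 31. */
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b4 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b5 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b6 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
  /* Pass 3: 64-bit interleave, e.g. r[0] = 00 10 20 30 40 50 60 70 (column 0). */
  r[0] = _mm_unpacklo_epi64(b0, b4);
  r[1] = _mm_unpackhi_epi64(b0, b4);
  r[2] = _mm_unpacklo_epi64(b1, b5);
  r[3] = _mm_unpackhi_epi64(b1, b5);
  r[4] = _mm_unpacklo_epi64(b2, b6);
  r[5] = _mm_unpackhi_epi64(b2, b6);
  r[6] = _mm_unpacklo_epi64(b3, b7);
  r[7] = _mm_unpackhi_epi64(b3, b7);
}

The removed code runs this once per 8-register quadrant of the 32-wide row block, which is why the loop below iterates transpose_block from 0 to 3.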
- { - int transpose_block; - int16_t *output; - if (0 == pass) { - output = &intermediate[column_start * 32]; - } else { - output = &output_org[column_start * 32]; - } - for (transpose_block = 0; transpose_block < 4; ++transpose_block) { - __m128i *this_out = &out[8 * transpose_block]; - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (0 == pass) { - // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; - // TODO(cd): see quality impact of only doing - // output[j] = (output[j] + 1) >> 2; - // which would remove the code between here ...
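The code between those two markers implements the rounding named in the comment: _mm_cmpgt_epi16 returns 0xFFFF, arithmetically -1, in each positive lane, so subtracting that mask adds one there before the +1 bias and the shift by 2. A scalar equivalent of the pass-0 output scaling (the name is illustrative):

/* Divide by 4, rounding halves away from zero: (x + 1 + (x > 0)) >> 2. */
static int16_t fdct32_pass0_round_sketch(int16_t x) {
  return (int16_t)((x + 1 + (x > 0)) >> 2);
}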
- __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); - __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); - __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); - __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); - __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); - __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); - __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); - __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); - tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); - tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); - tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); - tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); - tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); - tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); - tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); - tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); - // ... and here. - // PS: also change code in vp9/encoder/vp9_dct.c - tr2_0 = _mm_add_epi16(tr2_0, kOne); - tr2_1 = _mm_add_epi16(tr2_1, kOne); - tr2_2 = _mm_add_epi16(tr2_2, kOne); - tr2_3 = _mm_add_epi16(tr2_3, kOne); - tr2_4 = _mm_add_epi16(tr2_4, kOne); - tr2_5 = _mm_add_epi16(tr2_5, kOne); - tr2_6 = _mm_add_epi16(tr2_6, kOne); - tr2_7 = _mm_add_epi16(tr2_7, kOne); - tr2_0 = _mm_srai_epi16(tr2_0, 2); - tr2_1 = _mm_srai_epi16(tr2_1, 2); - tr2_2 = _mm_srai_epi16(tr2_2, 2); - tr2_3 = _mm_srai_epi16(tr2_3, 2); - tr2_4 = _mm_srai_epi16(tr2_4, 2); - tr2_5 = _mm_srai_epi16(tr2_5, 2); - tr2_6 = _mm_srai_epi16(tr2_6, 2); - tr2_7 = _mm_srai_epi16(tr2_7, 2); - } - // Note: even though all these stores are aligned, using the aligned - // intrinsic makes the code slightly slower. - _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); - _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); - _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); - _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); - _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); - _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); - _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); - _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); - // Process next 8x8 - output += 8; - } - } - } - } -} +#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D vp9_short_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm index 60f7991..db30660 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -36,6 +36,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pshufd m4, m4, 0 mova m2, [quantq] ; m2 = quant paddw m0, m4 ; m0 = zbin + zbin_oq +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif mova m3, [r2q] ; m3 = dequant psubw m0, [pw_1] mov r2, shiftmp @@ -43,6 +51,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob lea coeffq, [ coeffq+ncoeffq*2] @@ -56,16 +67,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6
= abs(m9) pabsw m11, m10 ; m11 = abs(m10) -%ifidn %1, b_32x32 - paddw m6, m6 - paddw m11, m11 -%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin punpckhqdq m0, m0 pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddw m6, m1 ; m6 += round + paddsw m6, m1 ; m6 += round punpckhqdq m1, m1 - paddw m11, m1 ; m11 += round + paddsw m11, m1 ; m11 += round pmulhw m8, m6, m2 ; m8 = m6*q>>16 punpckhqdq m2, m2 pmulhw m13, m11, m2 ; m13 = m11*q>>16 @@ -112,10 +119,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) -%ifidn %1, b_32x32 - paddw m6, m6 - paddw m11, m11 -%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin %ifidn %1, b_32x32 @@ -124,8 +127,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ or r6, r2 jz .skip_iter %endif - paddw m6, m1 ; m6 += round - paddw m11, m1 ; m11 += round + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round pmulhw m14, m6, m2 ; m14 = m6*q>>16 pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 @@ -164,6 +167,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m13 add ncoeffq, mmsize jl .ac_only_loop + %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm index 19e2feb..533456b 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -270,8 +270,13 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ %if mmsize == 16 movhps m2, [srcq+src_strideq*2] %else ; mmsize == 8 +%if %1 == 4 + movh m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%else punpckldq m2, [srcq+src_strideq*2] %endif +%endif movh m1, [dstq] %if mmsize == 16 movlhps m0, m2 @@ -542,9 +547,16 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ movhps m2, [srcq+src_strideq] movhps m3, [srcq+src_strideq+1] %else +%if %1 == 4 + movh m1, [srcq+src_strideq] + punpckldq m2, m1 + movh m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%else punpckldq m2, [srcq+src_strideq] punpckldq m3, [srcq+src_strideq+1] %endif +%endif pavgb m2, m3 %if mmsize == 16 movlhps m0, m2 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm index d3dbefe..3501cf1 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm @@ -342,8 +342,8 @@ sym(vp9_get4x4var_mmx): movsxd rdx, dword ptr arg(3) ;[recon_stride] ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm0, [rax] ; Copy 4 bytes to mm0 + movd mm1, [rbx] ; Copy 4 bytes to mm1 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -351,12 +351,12 @@ sym(vp9_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy 4 bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy 4 bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -365,11 +365,11 @@ sym(vp9_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add
rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy 4 bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy 4 bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -378,11 +378,11 @@ sym(vp9_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy 4 bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy 4 bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index b4ff850..cea934d 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -244,7 +244,7 @@ unsigned int vp9_variance16x16_sse2 return (var - (((unsigned int)avg * avg) >> 8)); } -unsigned int vp9_mse16x16_wmt( +unsigned int vp9_mse16x16_sse2( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, @@ -500,7 +500,7 @@ FNS(ssse3, ssse3); #undef FNS #undef FN -unsigned int vp9_variance_halfpixvar16x16_h_wmt( +unsigned int vp9_variance_halfpixvar16x16_h_sse2( const unsigned char *src_ptr, int src_pixels_per_line, const unsigned char *dst_ptr, @@ -519,7 +519,7 @@ unsigned int vp9_variance_halfpixvar16x16_h_wmt( } -unsigned int vp9_variance_halfpixvar16x16_v_wmt( +unsigned int vp9_variance_halfpixvar16x16_v_sse2( const unsigned char *src_ptr, int src_pixels_per_line, const unsigned char *dst_ptr, @@ -537,7 +537,7 @@ unsigned int vp9_variance_halfpixvar16x16_v_wmt( } -unsigned int vp9_variance_halfpixvar16x16_hv_wmt( +unsigned int vp9_variance_halfpixvar16x16_hv_sse2( const unsigned char *src_ptr, int src_pixels_per_line, const unsigned char *dst_ptr, diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index b2b2a80..687fb48 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -49,6 +49,8 @@ VP9_COMMON_SRCS-yes += common/vp9_rtcd.c VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h +VP9_COMMON_SRCS-yes += common/vp9_scale.h +VP9_COMMON_SRCS-yes += common/vp9_scale.c VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h @@ -71,29 +73,41 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c -VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h -VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h +VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -ifeq ($(CONFIG_POSTPROC),yes) +ifeq
($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif ifeq ($(USE_X86INC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm endif VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index be7828f..48866d2 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -51,14 +51,14 @@ static const struct extraconfig_map extracfg_map[] = { { NULL, 0, /* cpu_used */ - 0, /* enable_auto_alt_ref */ + 1, /* enable_auto_alt_ref */ 0, /* noise_sensitivity */ 0, /* Sharpness */ 0, /* static_thresh */ 0, /* tile_columns */ 0, /* tile_rows */ - 0, /* arnr_max_frames */ - 3, /* arnr_strength */ + 7, /* arnr_max_frames */ + 5, /* arnr_strength */ 3, /* arnr_type*/ 0, /* experimental mode */ 0, /* tuning*/ @@ -89,6 +89,18 @@ struct vpx_codec_alg_priv { unsigned int fixed_kf_cntr; }; +static const VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { + switch (frame) { + case VP8_LAST_FRAME: + return VP9_LAST_FLAG; + case VP8_GOLD_FRAME: + return VP9_GOLD_FLAG; + case VP8_ALTR_FRAME: + return VP9_ALT_FLAG; + } + assert(!"Invalid Reference Frame"); + return VP9_LAST_FLAG; +} static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx, @@ -148,7 +160,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_threads, 64); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); - RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ); + RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); @@ -160,6 +172,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, 
RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); + RANGE_CHECK(cfg, ss_number_layers, 1, + VPX_SS_MAX_LAYERS); /*Spatial layers max */ /* VP8 does not support a lower bound on the keyframe interval in * automatic keyframe placement mode. */ @@ -262,13 +276,15 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, // VBR only supported for now. // CBR code has been deprecated for experimental phase. // CQ mode not yet tested - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; - /*if (cfg.rc_end_usage == VPX_CQ) oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; else oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;*/ + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + /* + if (cfg.rc_end_usage == VPX_CQ) + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; + */ + if (cfg.rc_end_usage == VPX_Q) + oxcf->end_usage = USAGE_CONSTANT_QUALITY; - oxcf->target_bandwidth = cfg.rc_target_bitrate; + oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; @@ -317,6 +333,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->error_resilient_mode = cfg.g_error_resilient; oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; + + oxcf->ss_number_layers = cfg.ss_number_layers; /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -411,21 +429,22 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, #define MAP(id, var) case id: var = CAST(id, args); break; switch (ctrl_id) { - MAP(VP8E_SET_CPUUSED, xcfg.cpu_used); - MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref); - MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); - MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); - MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); - MAP(VP9E_SET_TILE_COLUMNS, xcfg.tile_columns); - MAP(VP9E_SET_TILE_ROWS, xcfg.tile_rows); - - MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); - MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength); - MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type); - MAP(VP8E_SET_TUNING, xcfg.tuning); - MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); - MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); - MAP(VP9E_SET_LOSSLESS, xcfg.lossless); + MAP(VP8E_SET_CPUUSED, xcfg.cpu_used); + MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref); + MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); + MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); + MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); + MAP(VP9E_SET_TILE_COLUMNS, xcfg.tile_columns); + MAP(VP9E_SET_TILE_ROWS, xcfg.tile_rows); + MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); + MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength); + MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type); + MAP(VP8E_SET_TUNING, xcfg.tuning); + MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); + MAP(VP9E_SET_MAX_Q, ctx->cfg.rc_max_quantizer); + MAP(VP9E_SET_MIN_Q, ctx->cfg.rc_min_quantizer); + MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); + MAP(VP9E_SET_LOSSLESS, xcfg.lossless); MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode); } @@ -846,7 +865,8 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); - vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd); + vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), + &sd); return VPX_CODEC_OK; } else return VPX_CODEC_INVALID_PARAM; @@ -864,7 +884,8 @@ static
vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); - vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd); + vp9_copy_reference_enc(ctx->cpi, + ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; } else return VPX_CODEC_INVALID_PARAM; @@ -889,7 +910,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); (void)ctr_id; @@ -1005,6 +1026,68 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } +static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id, + va_list args) { + unsigned int *data = va_arg(args, unsigned int *); + if (data) { + int res; + res = vp9_set_size_literal(ctx->cpi, *data, 0); + if (!res) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp9e_set_height(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + unsigned int *data = va_arg(args, unsigned int *); + + if (data) { + int res; + res = vp9_set_size_literal(ctx->cpi, 0, *data); + + if (!res) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp9e_set_layer(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { + unsigned int *data = va_arg(args, unsigned int *); + + if (data) { + int res; + res = 0; + + res = vp9_switch_layer(ctx->cpi, *data); + + if (!res) { + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, + va_list args) { + int data = va_arg(args, int); + vp9_set_svc(ctx->cpi, data); + return VPX_CODEC_OK; +} static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP8_SET_REFERENCE, vp9e_set_reference}, @@ -1026,14 +1109,20 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP8E_GET_LAST_QUANTIZER, get_param}, {VP8E_GET_LAST_QUANTIZER_64, get_param}, {VP8E_SET_ARNR_MAXFRAMES, set_param}, - {VP8E_SET_ARNR_STRENGTH, set_param}, - {VP8E_SET_ARNR_TYPE, set_param}, + {VP8E_SET_ARNR_STRENGTH, set_param}, + {VP8E_SET_ARNR_TYPE, set_param}, {VP8E_SET_TUNING, set_param}, {VP8E_SET_CQ_LEVEL, set_param}, + {VP9E_SET_MAX_Q, set_param}, + {VP9E_SET_MIN_Q, set_param}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, {VP9E_SET_LOSSLESS, set_param}, {VP9E_SET_FRAME_PARALLEL_DECODING, set_param}, {VP9_GET_REFERENCE, get_reference}, + {VP9E_SET_WIDTH, vp9e_set_width}, + {VP9E_SET_HEIGHT, vp9e_set_height}, + {VP9E_SET_LAYER, vp9e_set_layer}, + {VP9E_SET_SVC, vp9e_set_svc}, { -1, NULL}, }; @@ -1053,7 +1142,7 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { VPX_RC_ONE_PASS, /* g_pass */ - 0, /* g_lag_in_frames */ + 25, /* g_lag_in_frames */ 0, /* rc_dropframe_thresh */ 0, /* rc_resize_allowed */ @@ -1065,7 +1154,7 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { {0}, /* rc_twopass_stats_in */ #endif 256, /* rc_target_bandwidth */ - 4, /* rc_min_quantizer */ + 0, /* rc_min_quantizer */ 63, /* rc_max_quantizer */ 100, /* rc_undershoot_pct */ 100, /* rc_overshoot_pct */ @@ -1076,13 +1165,15 @@ static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* 
rc_two_pass_vbrmin_section */ - 400, /* rc_two_pass_vbrmax_section */ + 2000, /* rc_two_pass_vbrmax_section */ /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ 0, /* kf_min_dist */ 9999, /* kf_max_dist */ + VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ + #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 05029b9..10b3238 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -15,11 +15,12 @@ #include "vpx/vp8dx.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" -#include "decoder/vp9_onyxd.h" -#include "decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_onyxd.h" +#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/vp9_iface_common.h" -#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) +#define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) typedef vpx_codec_stream_info_t vp9_stream_info_t; /* Structures for handling memory allocations */ @@ -142,32 +143,64 @@ static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) { static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si) { - vpx_codec_err_t res = VPX_CODEC_OK; - if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM; + if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; - if (data + data_sz <= data) { - res = VPX_CODEC_INVALID_PARAM; - } else { - const int frame_marker = (data[0] >> 6) & 0x3; - const int version = (data[0] >> 4) & 0x3; + si->is_kf = 0; + si->w = si->h = 0; + + { + struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; + const int frame_marker = vp9_rb_read_literal(&rb, 2); + const int version = vp9_rb_read_bit(&rb) | (vp9_rb_read_bit(&rb) << 1); if (frame_marker != 0x2) return VPX_CODEC_UNSUP_BITSTREAM; +#if CONFIG_NON420 + if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM; +#else if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; +#endif + + if (vp9_rb_read_bit(&rb)) { // show an existing frame + return VPX_CODEC_OK; + } - si->is_kf = !((data[0] >> 2) & 0x1); + si->is_kf = !vp9_rb_read_bit(&rb); if (si->is_kf) { - const uint8_t *c = data + 1; + const int sRGB = 7; + int colorspace; - if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2) + rb.bit_offset += 1; // show frame + rb.bit_offset += 1; // error resilient + + if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 || + vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 || + vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) { return VPX_CODEC_UNSUP_BITSTREAM; + } - c += 3; - si->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1; - si->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1; + colorspace = vp9_rb_read_literal(&rb, 3); + if (colorspace != sRGB) { + rb.bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range + if (version == 1) { + rb.bit_offset += 2; // subsampling x/y + rb.bit_offset += 1; // has extra plane + } + } else { + if (version == 1) { + rb.bit_offset += 1; // has extra plane + } else { + // RGB is only available in version 1 + return VPX_CODEC_UNSUP_BITSTREAM; + } + } + + // TODO(jzern): these are available on non-keyframes in intra only mode. 
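The rewritten vp9_peek_si() above relies on the bit reader consuming bits most-significant-bit first, which is what makes the 2-bit frame marker come out of the top of the first byte. A sketch of the semantics assumed for vp9_rb_read_bit()/vp9_rb_read_literal() (struct and names simplified here for illustration, not copied from vp9/decoder/vp9_read_bit_buffer.h):

#include <stddef.h>
#include <stdint.h>

struct rb_sketch {
  const uint8_t *buf;
  size_t bit_offset;
};

static int rb_read_bit_sketch(struct rb_sketch *rb) {
  const size_t off = rb->bit_offset++;
  return (rb->buf[off >> 3] >> (7 - (off & 7))) & 1;  /* MSB first */
}

static int rb_read_literal_sketch(struct rb_sketch *rb, int bits) {
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--)  /* high bit is read first */
    value |= rb_read_bit_sketch(rb) << bit;
  return value;
}

Advancing rb.bit_offset without reading, as the header-skipping lines above do, simply discards fields of known width.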
+ si->w = vp9_rb_read_literal(&rb, 16) + 1; + si->h = vp9_rb_read_literal(&rb, 16) + 1; } } - return res; + return VPX_CODEC_OK; } static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, @@ -368,6 +401,8 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, uint32_t sizes[8]; int frames_this_pts, frame_count = 0; + if (data == NULL || data_sz == 0) return VPX_CODEC_INVALID_PARAM; + parse_superframe_index(data, data_sz, sizes, &frames_this_pts); do { @@ -561,7 +596,7 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, int ctr_id, va_list args) { -#if CONFIG_POSTPROC +#if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); if (data) { diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 288c0d8..9fbf100 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -64,7 +64,7 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c VP9_CX_SRCS-yes += encoder/vp9_variance_c.c -ifeq ($(CONFIG_POSTPROC),yes) +ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif @@ -78,18 +78,18 @@ VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm ifeq ($(USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm endif ifeq ($(ARCH_X86_64),yes) @@ -100,5 +100,6 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libvpx/vp9_spatial_scalable_encoder.c b/libvpx/vp9_spatial_scalable_encoder.c new file mode 100644 index 0000000..8bb582f --- /dev/null +++ b/libvpx/vp9_spatial_scalable_encoder.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +/* + * This is an example demonstrating how to implement a multi-layer + * VP9 encoding scheme based on spatial scalability for video applications + * that benefit from a scalable bitstream. + */ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <time.h> +#include <string.h> +#include <unistd.h> +#include <libgen.h> +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#define interface (vpx_codec_vp9_cx()) +#define fourcc 0x30395056 +#define IVF_FILE_HDR_SZ (32) +#define IVF_FRAME_HDR_SZ (12) +#define NUM_BUFFERS 8 + +char *input_filename; +char *output_filename; +unsigned int number_frames_to_code = 60 * 60; +unsigned int number_frames_to_skip = 0; +unsigned int number_spatial_layers = 5; +unsigned int key_period = 100; + +typedef enum ENCODING_MODE { + INTER_LAYER_PREDICTION_I, + INTER_LAYER_PREDICTION_IP, + USE_GOLDEN_FRAME +} ENCODING_MODE; + +static void mem_put_le16(char *mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; +} + +static void mem_put_le32(char *mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; + mem[2] = val >> 16; + mem[3] = val >> 24; +} + +static void usage(char *program_name) { + printf( + "Usage: %s [-f frames] [-s skip_frames] [-w width] [-h height] \n\t" + "[-n rate_num] [-d rate_den] [-b bitrate] [-l layers] " + "<input_filename> <output_filename>\n", + basename(program_name)); + exit(EXIT_FAILURE); +} + +static void die(const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + if (fmt[strlen(fmt) - 1] != '\n') printf("\n"); + exit(EXIT_FAILURE); +} + +static void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + printf("%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) printf(" %s\n", detail); + exit(EXIT_FAILURE); +} + +static int read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w * img->h * 3 / 2; + nbytes = fread(img->planes[0], 1, to_read, f); + if (nbytes != to_read) { + res = 0; + if (nbytes > 0) + printf("Warning: Read partial frame. 
Check your width & height!\n"); + } + return res; +} + +static int read_dummy_frame(vpx_image_t *img) { + size_t to_read; + + to_read = img->w * img->h * 3 / 2; + memset(img->planes[0], 129, to_read); + return 1; +} + +static void write_ivf_file_header(FILE *outfile, const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); /* version */ + mem_put_le16(header + 6, 32); /* headersize */ + mem_put_le32(header + 8, fourcc); /* fourcc */ + mem_put_le16(header + 12, cfg->g_w); /* width */ + mem_put_le16(header + 14, cfg->g_h); /* height */ + mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header + 24, frame_cnt); /* length */ + mem_put_le32(header + 28, 0); /* unused */ + + (void)fwrite(header, 1, 32, outfile); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) { + char header[12]; + vpx_codec_pts_t pts; + + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header + 4, pts & 0xFFFFFFFF); + mem_put_le32(header + 8, pts >> 32); + + (void)fwrite(header, 1, 12, outfile); +} + +static void check_parameters() { + if (number_spatial_layers > 5) die("Cannot support more than 5 layers"); +} + +static void parse_command_line(int argc, char **argv, + vpx_codec_enc_cfg_t *cfg) { + unsigned int width = 1920; + unsigned int height = 1080; + unsigned int timebase_num = 1; + unsigned int timebase_den = 60; + unsigned int bitrate = 1000; + int c; + vpx_codec_err_t res; + + opterr = 0; + while ((c = getopt(argc, argv, "f:w:h:n:d:b:s:l:p:")) != -1) switch (c) { + case 'f': + number_frames_to_code = atoi(optarg); + break; + case 'w': + width = atoi(optarg); + break; + case 'h': + height = atoi(optarg); + break; + case 'n': + timebase_num = atoi(optarg); + break; + case 'd': + timebase_den = atoi(optarg); + break; + case 'b': + bitrate = atoi(optarg); + break; + case 's': + number_frames_to_skip = atoi(optarg); + break; + case 'l': + number_spatial_layers = atoi(optarg); + break; + case 'p': + key_period = atoi(optarg); + break; + case '?': + usage(argv[0]); + } + + // Parse required parameters + if (argc - optind != 2) { + usage(argv[0]); + } + + input_filename = argv[optind]; + output_filename = argv[optind + 1]; + + if (width < 16 || width % 2 || height < 16 || height % 2) + die("Invalid resolution: %d x %d", width, height); + + /* Populate encoder configuration */ + res = vpx_codec_enc_config_default(interface, cfg, 0); + if (res) { + die("Failed to get config: %s\n", vpx_codec_err_to_string(res)); + } + printf( + "Codec %s\nframes: %d, skip: %d, layers: %d\n" + "width %d, height: %d, \n" + "num: %d, den: %d, bitrate: %d, \n" + "key period: %d \n", + vpx_codec_iface_name(interface), number_frames_to_code, + number_frames_to_skip, number_spatial_layers, width, height, timebase_num, + timebase_den, bitrate, key_period); + + // Do minimal check at the application level.
+
+static void set_default_configuration(vpx_codec_enc_cfg_t *cfg) {
+  /* Real time parameters */
+  cfg->rc_dropframe_thresh = 0;
+  cfg->rc_end_usage = VPX_CBR;
+  cfg->rc_resize_allowed = 0;
+  cfg->rc_min_quantizer = 33;
+  cfg->rc_max_quantizer = 33;
+  cfg->rc_undershoot_pct = 100;
+  cfg->rc_overshoot_pct = 15;
+  cfg->rc_buf_initial_sz = 500;
+  cfg->rc_buf_optimal_sz = 600;
+  cfg->rc_buf_sz = 1000;
+
+  /* Enable error resilient mode */
+  cfg->g_error_resilient = 1;
+  cfg->g_lag_in_frames = 0;
+
+  /* Disable automatic keyframe placement */
+  cfg->kf_mode = VPX_KF_DISABLED;
+  cfg->kf_min_dist = cfg->kf_max_dist = 3000;
+}
+
+static void initialize_codec(vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *cfg) {
+  int max_intra_size_pct;
+
+  /* Initialize codec */
+  if (vpx_codec_enc_init(codec, interface, cfg, VPX_CODEC_USE_PSNR))
+    die_codec(codec, "Failed to initialize encoder");
+
+  vpx_codec_control(codec, VP9E_SET_SVC, 1);
+  /* Cap CPU & first I-frame size */
+  vpx_codec_control(codec, VP8E_SET_CPUUSED, 1);
+  vpx_codec_control(codec, VP8E_SET_STATIC_THRESHOLD, 1);
+  vpx_codec_control(codec, VP8E_SET_NOISE_SENSITIVITY, 1);
+  vpx_codec_control(codec, VP8E_SET_TOKEN_PARTITIONS, 1);
+
+  max_intra_size_pct =
+      (int)(((double)cfg->rc_buf_optimal_sz * 0.5) *
+            ((double)cfg->g_timebase.den / cfg->g_timebase.num) / 10.0);
+  /* printf ("max_intra_size_pct=%d\n", max_intra_size_pct); */
+
+  vpx_codec_control(codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
+}
+
+static int calculate_layer(int frame_cnt, int number_spatial_layers) {
+  if (frame_cnt == 0)
+    return 0;
+  else
+    return (frame_cnt + number_spatial_layers - 1) % number_spatial_layers;
+}
+
+static void switch_to_layer(int layer, unsigned int initial_width,
+                            unsigned int initial_height,
+                            vpx_codec_ctx_t *codec) {
+  // Set layer size
+  int scaling_factor_num[MAX_LAYERS] = {2, 1, 4, 2, 1};
+  int scaling_factor_den[MAX_LAYERS] = {9, 3, 9, 3, 1};
+
+  int quantizer[MAX_LAYERS] = {60, 53, 39, 33, 27};
+
+  unsigned int current_width;
+  unsigned int current_height;
+
+  current_width = initial_width *
+                  scaling_factor_num[layer + 5 - number_spatial_layers] /
+                  scaling_factor_den[layer + 5 - number_spatial_layers];
+  current_height = initial_height *
+                   scaling_factor_num[layer + 5 - number_spatial_layers] /
+                   scaling_factor_den[layer + 5 - number_spatial_layers];
+
+  current_width += current_width % 2;
+  current_height += current_height % 2;
+
+  vpx_codec_control(codec, VP9E_SET_WIDTH, &current_width);
+  vpx_codec_control(codec, VP9E_SET_HEIGHT, &current_height);
+
+  // Set layer context
+  vpx_codec_control(codec, VP9E_SET_LAYER, &layer);
+  vpx_codec_control(codec, VP9E_SET_MAX_Q,
+                    quantizer[layer + 5 - number_spatial_layers]);
+  vpx_codec_control(codec, VP9E_SET_MIN_Q,
+                    quantizer[layer + 5 - number_spatial_layers]);
+}
+
+static int get_flag(int is_I_frame_in_layer, int layer, ENCODING_MODE mode) {
+  // First layer
+  switch (mode) {
+    case INTER_LAYER_PREDICTION_I:
+      if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
+      if (layer == 0)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      else if (is_I_frame_in_layer)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
+      else
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      break;
+
+    case INTER_LAYER_PREDICTION_IP:
+      if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
+      if (layer == 0)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      else if (is_I_frame_in_layer)
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+               VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
+      else
+        return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
+      break;
+
+    case USE_GOLDEN_FRAME:
+      if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
+      if (2 * number_spatial_layers - NUM_BUFFERS <= layer) {
+        if (layer == 0)
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_ARF;
+        else if (is_I_frame_in_layer)
+          return VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF |
+                 VP8_EFLAG_NO_REF_LAST;
+        else
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        if (layer == 0)
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+        else if (is_I_frame_in_layer)
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
+        else
+          return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                 VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      }
+      break;
+    default:
+      return VPX_EFLAG_FORCE_KF;
+  }
+}
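To make the scaling arithmetic in switch_to_layer() concrete, a worked example with number_spatial_layers = 5 and a 1920x1080 input (integer division; the current_width += current_width % 2 step rounds odd results up to even):

    /* layer 0: 1920*2/9 x 1080*2/9  ->  426x240    (min/max Q 60)
     * layer 1: 1920*1/3 x 1080*1/3  ->  640x360    (Q 53)
     * layer 2: 1920*4/9 x 1080*4/9  ->  853x480, rounded to 854x480  (Q 39)
     * layer 3: 1920*2/3 x 1080*2/3  -> 1280x720    (Q 33)
     * layer 4: unscaled             -> 1920x1080   (Q 27)
     */

With fewer layers, the layer + 5 - number_spatial_layers offset drops the smallest entries, so three layers, for example, use only the 4/9, 2/3 and 1/1 factors.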
+
+int main(int argc, char **argv) {
+  FILE *infile, *outfile[MAX_LAYERS];
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_cnt = 0;
+  vpx_image_t raw;
+  int frame_avail = 1;
+  int got_data = 0;
+  int i;
+  int frames_in_layer[MAX_LAYERS] = {0};
+  clock_t before;
+  clock_t after;
+  int pts = 0;            /* PTS starts at 0 */
+  int frame_duration = 1; /* 1 timebase tick per frame */
+
+  parse_command_line(argc, argv, &cfg);
+
+  // Allocate image buffer
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 32))
+    die("Failed to allocate image (%dx%d)", cfg.g_w, cfg.g_h);
+
+  set_default_configuration(&cfg);
+
+  /* Open input file */
+  if (!(infile = fopen(input_filename, "rb")))
+    die("Failed to open %s for reading", input_filename);
+
+  /* Open output file */
+  for (i = 0; i < number_spatial_layers; i++) {
+    char file_name[512];
+    snprintf(file_name, sizeof(file_name), "%s_%d.ivf", output_filename, i);
+    if (!(outfile[i] = fopen(file_name, "wb")))
+      die("Failed to open %s for writing", file_name);
+    write_ivf_file_header(outfile[i], &cfg, 0);
+  }
+
+  initialize_codec(&codec, &cfg);
+
+  // Skip initial frames
+  for (i = 0; i < number_frames_to_skip; i++) {
+    read_frame(infile, &raw);
+  }
+
+  before = clock();
+  // Encoding frames
+  while ((frame_avail || got_data) &&
+         frame_cnt <= number_frames_to_code * number_spatial_layers) {
+    int flags = 0;
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *pkt;
+
+    int layer = calculate_layer(frame_cnt, number_spatial_layers);
+    int is_I_frame_in_layer =
+        (((frame_cnt - 1) / number_spatial_layers % key_period) == 0);
+    int is_dummy = (frame_cnt == 0);
+
+    if (is_dummy) {  // Dummy frame
+      flags = VPX_EFLAG_FORCE_KF;
+      frame_avail = read_dummy_frame(&raw);
+
+    } else {  // Regular frame
+      // Read a new frame only at the base layer
+      if (layer == 0) frame_avail = read_frame(infile, &raw);
+      switch_to_layer(layer, cfg.g_w, cfg.g_h, &codec);
+      flags = get_flag(is_I_frame_in_layer, layer, INTER_LAYER_PREDICTION_I);
+    }
+
+    // Actual Encoding
+    if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags,
+                         VPX_DL_REALTIME))
+      die_codec(&codec, "Failed to encode frame");
+
+    got_data = 0;
+    // Process data / Get PSNR statistics
+    while ((pkt = vpx_codec_get_cx_data(&codec, &iter))) {
+      got_data = 1;
+      switch (pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT:
+          for (i = layer; i < number_spatial_layers; i++) {
+            write_ivf_frame_header(outfile[i], pkt);
+            (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                         outfile[i]);
+            frames_in_layer[i]++;
+          }
+          break;
+        case VPX_CODEC_PSNR_PKT:
+          if (frame_cnt != 0)
+            printf(
+                "Processed Frame %d, layer %d, PSNR(Total/Y/U/V): "
+                "%2.3f %2.3f %2.3f %2.3f\n",
+                (frame_cnt - 1) / number_spatial_layers + 1, layer,
+                pkt->data.psnr.psnr[0], pkt->data.psnr.psnr[1],
+                pkt->data.psnr.psnr[2], pkt->data.psnr.psnr[3]);
+          break;
+        default:
+          break;
+      }
+    }
+    frame_cnt++;
+    // TODO(ivan): Modify ts later if(!layer)
+    pts += frame_duration;
+  }
+  // end while
+
+  after = clock();
+  printf("Processed %d frames in different resolutions in %ld ms.\n",
+         frame_cnt - 1, (long)((after - before) / (CLOCKS_PER_SEC / 1000)));
+
+  fclose(infile);
+
+  if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  /* Try to rewrite the output file headers with the actual frame count */
+  for (i = 0; i < number_spatial_layers; i++) {
+    if (!fseek(outfile[i], 0, SEEK_SET)) {
+      write_ivf_file_header(outfile[i], &cfg, frames_in_layer[i]);
+    }
+    fclose(outfile[i]);
+  }
+
+  return EXIT_SUCCESS;
+}
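A note on the schedule the loop above implements: frame_cnt 0 encodes a dummy keyframe (read_dummy_frame() fills the buffer with the constant 129), and calculate_layer() then cycles successive frame counts through the layers, so each source frame, read only when layer == 0, is encoded once per layer. A quick check with a hypothetical number_spatial_layers = 3:

    /* calculate_layer(1, 3) == (1 + 2) % 3 == 0   (base layer, new source frame)
     * calculate_layer(2, 3) == (2 + 2) % 3 == 1
     * calculate_layer(3, 3) == (3 + 2) % 3 == 2
     * calculate_layer(4, 3) == (4 + 2) % 3 == 0   (next source frame)
     */

And since a packet produced at layer k is written to outfile[k] through outfile[number_spatial_layers - 1], each out_k.ivf accumulates layers 0 through k: the higher-numbered files are supersets that decode to higher resolutions.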
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index f8e2ef9..f3ea6d3 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -190,7 +190,15 @@ enum vp8e_enc_control_id {
   VP9E_SET_LOSSLESS,
   VP9E_SET_TILE_COLUMNS,
   VP9E_SET_TILE_ROWS,
-  VP9E_SET_FRAME_PARALLEL_DECODING
+  VP9E_SET_FRAME_PARALLEL_DECODING,
+
+  VP9E_SET_WIDTH = 99,
+  VP9E_SET_HEIGHT,
+  VP9E_SET_LAYER,
+  VP9E_SET_SVC,
+
+  VP9E_SET_MAX_Q,
+  VP9E_SET_MIN_Q
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -292,6 +300,12 @@ VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
 
+VPX_CTRL_USE_TYPE(VP9E_SET_LAYER, int *)
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_WIDTH, unsigned int *)
+VPX_CTRL_USE_TYPE(VP9E_SET_HEIGHT, unsigned int *)
+
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY, unsigned int)
@@ -316,6 +330,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
 
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_Q, unsigned int)
+VPX_CTRL_USE_TYPE(VP9E_SET_MIN_Q, unsigned int)
 /*! @} - end defgroup vp8_encoder */
 #include "vpx_codec_impl_bottom.h"
 #endif
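The VPX_CTRL_USE_TYPE declarations above fix the argument conventions for the new controls: VP9E_SET_SVC takes a plain int, while VP9E_SET_LAYER, VP9E_SET_WIDTH and VP9E_SET_HEIGHT take pointers. A minimal sketch of a caller (a hypothetical helper, mirroring the example encoder earlier in this change):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void select_spatial_layer(vpx_codec_ctx_t *codec, int layer,
                                     unsigned int width, unsigned int height) {
      vpx_codec_control(codec, VP9E_SET_SVC, 1);          /* int, by value */
      vpx_codec_control(codec, VP9E_SET_LAYER, &layer);   /* int * */
      vpx_codec_control(codec, VP9E_SET_WIDTH, &width);   /* unsigned int * */
      vpx_codec_control(codec, VP9E_SET_HEIGHT, &height); /* unsigned int * */
    }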
diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h
index ffdbc06..56fd2d9 100644
--- a/libvpx/vpx/vpx_encoder.h
+++ b/libvpx/vpx/vpx_encoder.h
@@ -46,6 +46,12 @@ extern "C" {
 /*!\deprecated Use #VPX_TS_MAX_LAYERS instead. */
 #define MAX_LAYERS VPX_TS_MAX_LAYERS
+/*! Spatial Scalability: Maximum number of coding layers */
+#define VPX_SS_MAX_LAYERS 5
+
+/*! Spatial Scalability: Default number of coding layers */
+#define VPX_SS_DEFAULT_LAYERS 3
+
 /*!\brief Current ABI version number
  *
  * \internal
@@ -217,9 +223,10 @@ extern "C" {
 /*!\brief Rate control mode */
 enum vpx_rc_mode {
-  VPX_VBR, /**< Variable Bit Rate (VBR) mode */
+  VPX_VBR,  /**< Variable Bit Rate (VBR) mode */
   VPX_CBR,  /**< Constant Bit Rate (CBR) mode */
-  VPX_CQ  /**< Constant Quality (CQ) mode */
+  VPX_CQ,   /**< Constrained Quality (CQ) mode */
+  VPX_Q,    /**< Constant Quality (Q) mode */
 };
@@ -595,8 +602,14 @@ extern "C" {
   unsigned int kf_max_dist;
 
   /*
-   * Temporal scalability settings (ts)
+   * Spatial scalability settings (ss)
+   */
+
+  /*!\brief Number of coding layers (spatial)
+   *
+   * This value specifies the number of coding layers to be used.
    */
+  unsigned int ss_number_layers;
 
   /*!\brief Number of coding layers
    *
diff --git a/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/libvpx/vpx_mem/include/vpx_mem_intrnl.h
index 60b5165..2248ad5 100644
--- a/libvpx/vpx_mem/include/vpx_mem_intrnl.h
+++ b/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -50,14 +50,10 @@ vpx_memcpy, _memset, and _memmove*/
 calls to vpx_* functions other than vpx_memalign*/
 # else
-# define DEFAULT_ALIGNMENT 1
+# define DEFAULT_ALIGNMENT (2 * sizeof(void*))  /* NOLINT */
 # endif
 #endif
 
-#if DEFAULT_ALIGNMENT < 1
-# error "DEFAULT_ALIGNMENT must be >= 1!"
-#endif
-
 #if CONFIG_MEM_TRACKER
 # define TRY_BOUNDS_CHECK 1
 /*when set to 1 pads each allocation, integrity can be checked using
diff --git a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm
index cc1789a..d070a47 100644
--- a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm
+++ b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm
@@ -9,7 +9,7 @@
 ;
 
-    EXPORT  |vp8_yv12_copy_y_neon|
+    EXPORT  |vpx_yv12_copy_y_neon|
 
     ARM
     REQUIRE8
@@ -19,8 +19,9 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vpxyv12_copy_y_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-|vp8_yv12_copy_y_neon| PROC
+;void vpx_yv12_copy_y_neon(const YV12_BUFFER_CONFIG *src_ybc,
+;                          YV12_BUFFER_CONFIG *dst_ybc)
+|vpx_yv12_copy_y_neon| PROC
     push            {r4 - r11, lr}
     vpush           {d8-d15}
diff --git a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
index 3f17883..696f47a 100644
--- a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
+++ b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
@@ -18,7 +18,7 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
 ;                                   YV12_BUFFER_CONFIG *dst_ybc);
 |vp8_yv12_copy_frame_func_neon| PROC
diff --git a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
index d452ad2..d3306b6 100644
--- a/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
+++ b/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
@@ -17,11 +17,12 @@
     INCLUDE vpx_scale_asm_offsets.asm
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;Note: This function is used to copy source data in src_buffer[i] at beginning of
-;the encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,
-;which can be ANY numbers(NOT always multiples of 16 or 4).
+;Note: This function is used to copy source data in src_buffer[i] at beginning
+;of the encoding.
The buffer has a width and height of cpi->oxcf.Width and +;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4). -;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, +; YV12_BUFFER_CONFIG *dst_ybc); |vp8_yv12_copy_src_frame_func_neon| PROC push {r4 - r11, lr} diff --git a/libvpx/vpx_scale/arm/neon/yv12extend_arm.c b/libvpx/vpx_scale/arm/neon/yv12extend_arm.c index 4535b8f..fac7bbc 100644 --- a/libvpx/vpx_scale/arm/neon/yv12extend_arm.c +++ b/libvpx/vpx_scale/arm/neon/yv12extend_arm.c @@ -10,12 +10,12 @@ #include "./vpx_scale_rtcd.h" -extern void vp8_yv12_copy_frame_func_neon(struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc); +extern void vp8_yv12_copy_frame_func_neon( + const struct yv12_buffer_config *src_ybc, + struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_frame_neon(struct yv12_buffer_config *src_ybc, +void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc) { vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - vp8_yv12_extend_frame_borders_neon(dst_ybc); } diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index 2592040..a89e29d 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++ b/libvpx/vpx_scale/generic/yv12config.c @@ -192,7 +192,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size + (alpha_border_h * alpha_stride) + alpha_border_w; #endif - ybf->corrupted = 0; /* assume not currupted by errors */ + ybf->corrupted = 0; /* assume not corrupted by errors */ return 0; } return -2; diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c index cc8da2a..f2aec2b 100644 --- a/libvpx/vpx_scale/generic/yv12extend.c +++ b/libvpx/vpx_scale/generic/yv12extend.c @@ -10,67 +10,52 @@ #include <assert.h> #include "./vpx_config.h" -#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -#include "vpx_scale/vpx_scale.h" - -/**************************************************************************** -* Exports -****************************************************************************/ +#include "vpx_scale/yv12config.h" -/**************************************************************************** - * - ****************************************************************************/ -static void extend_plane(uint8_t *s, /* source */ - int sp, /* source pitch */ - int w, /* width */ - int h, /* height */ - int et, /* extend top border */ - int el, /* extend left border */ - int eb, /* extend bottom border */ - int er) { /* extend right border */ +static void extend_plane(uint8_t *const src, int src_stride, + int width, int height, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { int i; - uint8_t *src_ptr1, *src_ptr2; - uint8_t *dest_ptr1, *dest_ptr2; - int linesize; + const int linesize = extend_left + extend_right + width; /* copy the left and right most columns out */ - src_ptr1 = s; - src_ptr2 = s + w - 1; - dest_ptr1 = s - el; - dest_ptr2 = s + w; - - for (i = 0; i < h; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], el); - vpx_memset(dest_ptr2, src_ptr2[0], er); - src_ptr1 += sp; - src_ptr2 += sp; - dest_ptr1 += sp; - dest_ptr2 += sp; + uint8_t *src_ptr1 = src; + uint8_t *src_ptr2 = src + width - 1; + uint8_t *dst_ptr1 = src - extend_left; + uint8_t *dst_ptr2 
= src + width; + + for (i = 0; i < height; ++i) { + vpx_memset(dst_ptr1, src_ptr1[0], extend_left); + vpx_memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; } /* Now copy the top and bottom lines into each line of the respective * borders */ - src_ptr1 = s - el; - src_ptr2 = s + sp * (h - 1) - el; - dest_ptr1 = s + sp * (-et) - el; - dest_ptr2 = s + sp * (h) - el; - linesize = el + er + w; - - for (i = 0; i < et; i++) { - vpx_memcpy(dest_ptr1, src_ptr1, linesize); - dest_ptr1 += sp; + src_ptr1 = src - extend_left; + src_ptr2 = src + src_stride * (height - 1) - extend_left; + dst_ptr1 = src + src_stride * -extend_top - extend_left; + dst_ptr2 = src + src_stride * height - extend_left; + + for (i = 0; i < extend_top; ++i) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += src_stride; } - for (i = 0; i < eb; i++) { - vpx_memcpy(dest_ptr2, src_ptr2, linesize); - dest_ptr2 += sp; + for (i = 0; i < extend_bottom; ++i) { + vpx_memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += src_stride; } } -void -vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { +void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); @@ -96,9 +81,9 @@ vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { } #if CONFIG_VP9 -static void extend_frame(YV12_BUFFER_CONFIG *ybf, - int subsampling_x, int subsampling_y, - int ext_size) { +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, + int subsampling_x, int subsampling_y, + int ext_size) { const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x; const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y; const int c_et = ext_size >> subsampling_y; @@ -126,7 +111,6 @@ static void extend_frame(YV12_BUFFER_CONFIG *ybf, c_w, c_h, c_et, c_el, c_eb, c_er); } - void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, int subsampling_x, int subsampling_y) { extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); @@ -134,33 +118,20 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, int subsampling_x, int subsampling_y) { - const int inner_bw = ybf->border > VP9INNERBORDERINPIXLES ? - VP9INNERBORDERINPIXLES : ybf->border; + const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ? + VP9INNERBORDERINPIXELS : ybf->border; extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); } -#endif +#endif // CONFIG_VP9 -/**************************************************************************** - * - * ROUTINE : vp8_yv12_copy_frame - * - * INPUTS : - * - * OUTPUTS : None. - * - * RETURNS : void - * - * FUNCTION : Copies the source image into the destination image and - * updates the destination's UMV borders. - * - * SPECIAL NOTES : The frames are assumed to be identical in size. - * - ****************************************************************************/ -void -vp8_yv12_copy_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) { +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. 
+void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { int row; - unsigned char *source, *dest; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; #if 0 /* These assertions are valid in the codec, but the libvpx-tester uses @@ -170,48 +141,42 @@ vp8_yv12_copy_frame_c(YV12_BUFFER_CONFIG *src_ybc, assert(src_ybc->y_height == dst_ybc->y_height); #endif - source = src_ybc->y_buffer; - dest = dst_ybc->y_buffer; - - for (row = 0; row < src_ybc->y_height; row++) { - vpx_memcpy(dest, source, src_ybc->y_width); - source += src_ybc->y_stride; - dest += dst_ybc->y_stride; + for (row = 0; row < src_ybc->y_height; ++row) { + vpx_memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; } - source = src_ybc->u_buffer; - dest = dst_ybc->u_buffer; + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; - for (row = 0; row < src_ybc->uv_height; row++) { - vpx_memcpy(dest, source, src_ybc->uv_width); - source += src_ybc->uv_stride; - dest += dst_ybc->uv_stride; + for (row = 0; row < src_ybc->uv_height; ++row) { + vpx_memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; } - source = src_ybc->v_buffer; - dest = dst_ybc->v_buffer; + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; - for (row = 0; row < src_ybc->uv_height; row++) { - vpx_memcpy(dest, source, src_ybc->uv_width); - source += src_ybc->uv_stride; - dest += dst_ybc->uv_stride; + for (row = 0; row < src_ybc->uv_height; ++row) { + vpx_memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; } vp8_yv12_extend_frame_borders_c(dst_ybc); } -void vp8_yv12_copy_y_c(YV12_BUFFER_CONFIG *src_ybc, +void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; - unsigned char *source, *dest; - - - source = src_ybc->y_buffer; - dest = dst_ybc->y_buffer; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; - for (row = 0; row < src_ybc->y_height; row++) { - vpx_memcpy(dest, source, src_ybc->y_width); - source += src_ybc->y_stride; - dest += dst_ybc->y_stride; + for (row = 0; row < src_ybc->y_height; ++row) { + vpx_memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; } } diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.sh b/libvpx/vpx_scale/vpx_scale_rtcd.sh index 21d1e52..ea7b0e2 100644 --- a/libvpx/vpx_scale/vpx_scale_rtcd.sh +++ b/libvpx/vpx_scale/vpx_scale_rtcd.sh @@ -19,11 +19,11 @@ fi prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf" specialize vp8_yv12_extend_frame_borders neon -prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" +prototype void vp8_yv12_copy_frame "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" specialize vp8_yv12_copy_frame neon -prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" -specialize vp8_yv12_copy_y neon +prototype void vpx_yv12_copy_y "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" +specialize vpx_yv12_copy_y neon if [ "$CONFIG_VP9" = "yes" ]; then prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h index 66e587a..0e950fb 100644 --- a/libvpx/vpx_scale/yv12config.h +++ b/libvpx/vpx_scale/yv12config.h @@ -18,7 +18,7 @@ extern "C" { 
#include "vpx/vpx_integer.h" #define VP8BORDERINPIXELS 32 -#define VP9INNERBORDERINPIXLES 96 +#define VP9INNERBORDERINPIXELS 96 #define VP9BORDERINPIXELS 160 #define VP9_INTERP_EXTEND 4 diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index 547b572..0c742ca 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -1046,6 +1046,7 @@ static const struct arg_enum_list end_usage_enum[] = { {"vbr", VPX_VBR}, {"cbr", VPX_CBR}, {"cq", VPX_CQ}, + {"q", VPX_Q}, {NULL, 0} }; static const arg_def_t end_usage = ARG_DEF_ENUM(NULL, "end-usage", 1, @@ -1126,7 +1127,7 @@ static const struct arg_enum_list tuning_enum[] = { static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, - "Constrained Quality Level"); + "Constant/Constrained Quality level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode"); @@ -1688,8 +1689,10 @@ static void parse_global_config(struct global_config *global, char **argv) { /* Initialize default parameters */ memset(global, 0, sizeof(*global)); global->codec = codecs; - global->passes = 1; + global->passes = 0; global->use_i420 = 1; + /* Assign default deadline to good quality */ + global->deadline = VPX_DL_GOOD_QUALITY; for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; @@ -1761,6 +1764,11 @@ static void parse_global_config(struct global_config *global, char **argv) { } /* Validate global config */ + if (global->passes == 0) { + // Make default VP9 passes = 2 until there is a better quality 1-pass + // encoder + global->passes = (global->codec->iface == vpx_codec_vp9_cx ? 
2 : 1); + } if (global->pass) { /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ @@ -2631,8 +2639,8 @@ int main(int argc, const char **argv_) { &global.framerate)); } - FOREACH_STREAM(open_output_file(stream, &global)); FOREACH_STREAM(setup_pass(stream, &global, pass)); + FOREACH_STREAM(open_output_file(stream, &global)); FOREACH_STREAM(initialize_encoder(stream, &global)); frame_avail = 1; diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt index 299d615..897d207 100644 --- a/mips-dspr2/libvpx_srcs.txt +++ b/mips-dspr2/libvpx_srcs.txt @@ -178,6 +178,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h index d6dc6bf..b23f1a6 100644 --- a/mips-dspr2/vp9_rtcd.h +++ b/mips-dspr2/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c -void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c -void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c -void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c -void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c -void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c -void 
vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c -void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c -void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c -void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c -void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c -void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c -void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c -void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c -void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c -void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c -void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c -void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c -void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void 
vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c -void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c -void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c -void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c -void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c -void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c -void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c -void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c -void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c -void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c -void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c -void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c -void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t 
*above, const uint8_t *left); #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c -void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c -void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c -void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c -void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c -void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c -void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); @@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c -unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad32x3 vp9_sad32x3_c - -unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad); -#define vp9_sad3x32 vp9_sad3x32_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/mips-dspr2/vpx_config.c b/mips-dspr2/vpx_config.c index cf19239..1036456 100644 --- a/mips-dspr2/vpx_config.c +++ b/mips-dspr2/vpx_config.c @@ -5,5 +5,5 @@ /* tree. An additional intellectual property rights grant can be found */ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ -static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --enable-dspr2 --disable-examples --disable-docs --enable-realtime-only"; +static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --enable-dspr2 --disable-examples --disable-docs --enable-realtime-only"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/mips-dspr2/vpx_config.h b/mips-dspr2/vpx_config.h index e85b676..e6cad01 100644 --- a/mips-dspr2/vpx_config.h +++ b/mips-dspr2/vpx_config.h @@ -59,6 +59,7 @@ #define CONFIG_DC_RECON 1 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_POSTPROC 0 +#define CONFIG_VP9_POSTPROC 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_INTERNAL_STATS 0 #define CONFIG_VP8_ENCODER 1 diff --git a/mips-dspr2/vpx_scale_rtcd.h b/mips-dspr2/vpx_scale_rtcd.h index be038f4..d9e41f3 100644 --- a/mips-dspr2/vpx_scale_rtcd.h +++ b/mips-dspr2/vpx_scale_rtcd.h @@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_y vp8_yv12_copy_y_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt index 055f5fb..8e6fad7 100644 --- a/mips/libvpx_srcs.txt +++ b/mips/libvpx_srcs.txt @@ -172,6 +172,8 @@ vp9/common/vp9_reconintra.h vp9/common/vp9_rtcd.c vp9/common/vp9_rtcd_defs.sh vp9/common/vp9_sadmxn.h +vp9/common/vp9_scale.c +vp9/common/vp9_scale.h vp9/common/vp9_seg_common.c vp9/common/vp9_seg_common.h vp9/common/vp9_subpelvar.h diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h index d6dc6bf..b23f1a6 100644 --- a/mips/vp9_rtcd.h +++ b/mips/vp9_rtcd.h @@ -36,160 +36,160 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c -void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d63_predictor_4x4_c(uint8_t 
*dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c -void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c -void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c -void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c -void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c -void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c -void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c -void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); -#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c -void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c -void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c

-void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c

-void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c

-void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c

-void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c

-void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c

-void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c

-void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c

-void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c

-void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c

-void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c

-void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
-#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c

-void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c

-void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c

-void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c

-void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c

-void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c

-void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c

-void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c

-void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c

-void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c

-void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c

-void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c

-void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c

-void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
-#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c

-void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c

-void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c

-void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c

-void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c

-void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c

-void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c

-void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c

-void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c

-void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c

-void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c

-void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c

-void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col);
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c

 void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride);
@@ -300,12 +300,6 @@ void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride)
 void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_short_iwalsh4x4_add vp9_short_iwalsh4x4_add_c

-unsigned int vp9_sad32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad);
-#define vp9_sad32x3 vp9_sad32x3_c
-
-unsigned int vp9_sad3x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad);
-#define vp9_sad3x32 vp9_sad3x32_c
-
 void vp9_rtcd(void);

 #include "vpx_config.h"
diff --git a/mips/vpx_config.c b/mips/vpx_config.c
index 84f0e8b..8c995fb 100644
--- a/mips/vpx_config.c
+++ b/mips/vpx_config.c
@@ -5,5 +5,5 @@
 /* tree. An additional intellectual property rights grant can be found */
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
-static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/johannkoenig/android-ndk --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only";
+static const char* const cfg = "--force-target=mips32-android-gcc --disable-runtime-cpu-detect --sdk-path=/usr/local/google/home/hkuang/Downloads/android-ndk-r8e --disable-vp9-encoder --disable-examples --disable-docs --enable-realtime-only";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/mips/vpx_config.h b/mips/vpx_config.h
index 7db47f8..8ead72e 100644
--- a/mips/vpx_config.h
+++ b/mips/vpx_config.h
@@ -59,6 +59,7 @@
 #define CONFIG_DC_RECON 1
 #define CONFIG_RUNTIME_CPU_DETECT 0
 #define CONFIG_POSTPROC 0
+#define CONFIG_VP9_POSTPROC 0
 #define CONFIG_MULTITHREAD 1
 #define CONFIG_INTERNAL_STATS 0
 #define CONFIG_VP8_ENCODER 1
diff --git a/mips/vpx_scale_rtcd.h b/mips/vpx_scale_rtcd.h
index be038f4..d9e41f3 100644
--- a/mips/vpx_scale_rtcd.h
+++ b/mips/vpx_scale_rtcd.h
@@ -33,11 +33,11 @@ void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pit
 void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c

-void vp8_yv12_copy_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c

-void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_y vp8_yv12_copy_y_c
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c

 void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
 #define vp9_extend_frame_borders vp9_extend_frame_borders_c
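Editor's note: the bulk of the vp9_rtcd.h churn above is one API change applied to every intra predictor: vp9_d27_* is renamed vp9_d207_* to match the upstream directional-mode naming, the destination parameter becomes a plain dst, and the above/left reference arrays become const-qualified. The sketch below shows how a predictor with the new (dst, y_stride, above, left) prototype is driven. It is a minimal, self-contained stand-in, not libvpx's implementation: the local dc_predictor_8x8 body and the sample pixel values are assumptions for illustration; only the prototype mirrors the declarations in the diff.

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-in with the same prototype as vp9_dc_predictor_8x8_c
     * above: dst receives the 8x8 prediction, y_stride is the row pitch of
     * dst, and above/left are the const-qualified reference pixel arrays. */
    static void dc_predictor_8x8(uint8_t *dst, ptrdiff_t y_stride,
                                 const uint8_t *above, const uint8_t *left) {
      int sum = 0;
      for (int i = 0; i < 8; i++) sum += above[i] + left[i];
      const uint8_t dc = (uint8_t)((sum + 8) / 16);  /* rounded average */
      for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++) dst[r * y_stride + c] = dc;
    }

    int main(void) {
      const uint8_t above[8] = {100, 100, 100, 100, 100, 100, 100, 100};
      const uint8_t left[8]  = {120, 120, 120, 120, 120, 120, 120, 120};
      uint8_t dst[8 * 8] = {0};
      dc_predictor_8x8(dst, 8, above, left);  /* stride == block width here */
      printf("dc value: %d\n", dst[0]);       /* (800 + 960 + 8) / 16 = 110 */
      return 0;
    }

Note that callers never invoke the _c symbols directly: the #define lines in these generated per-target headers (e.g. #define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c) pin each entry point to the C or, on NEON builds, assembly variant at compile time, which is consistent with CONFIG_RUNTIME_CPU_DETECT being 0 in the mips/vpx_config.h hunk above.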