author     hkuang <hkuang@google.com>  2013-07-25 11:11:39 -0700
committer  hkuang <hkuang@google.com>  2013-07-25 12:03:12 -0700
commit     91037db265ecdd914a26e056cf69207b4f50924e
tree       c78c618cf6d0ffb187e2734d524bca19698b3c0d
parent     ba164dffc5a6795bce97fae02b51ccf3330e15e4
Roll latest libvpx into Android.
Makes VP9 decoding 2X faster than the old version.
The checkout is from the master branch (hash: 242157c756314827ad9244952c7253e8900b9626).
Change-Id: Ibe67b3ee19f82b87df2416826b63a67f7f79b63a
232 files changed, 21557 insertions, 17417 deletions
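Most of the vp9_rtcd.h changes in the diff below follow libvpx's RTCD (run-time CPU detection) convention: for each hot function the per-target header declares the portable `_c` implementation and, in the armv7a-neon target, the matching `_neon` implementation, then binds the generic name to one of them with a `#define`. The sketch below is illustrative only, not the generated header itself; the real headers are pregenerated per target directory (armv7a-neon/, armv7a/, generic/), and `HAVE_NEON` is used here merely as a stand-in for that per-directory split. It shows the pattern for `vp9_convolve8`, whose prototypes appear verbatim in the hunks that follow.

```c
/* Illustrative sketch of the dispatch pattern in the vp9_rtcd.h hunks below.
 * NOTE: the actual headers are generated per build target; HAVE_NEON here is
 * an assumption standing in for that per-target split. */
#include <stddef.h>   /* ptrdiff_t */
#include <stdint.h>   /* uint8_t, int16_t */

/* Both variants are declared with identical signatures... */
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                     uint8_t *dst, ptrdiff_t dst_stride,
                     const int16_t *filter_x, int x_step_q4,
                     const int16_t *filter_y, int y_step_q4,
                     int w, int h);
void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h);

/* ...and the generic name is bound to whichever implementation the target
 * supports, so callers always invoke vp9_convolve8(). */
#if HAVE_NEON
#define vp9_convolve8 vp9_convolve8_neon
#else
#define vp9_convolve8 vp9_convolve8_c
#endif
```

Because the Android build ships one pregenerated header per target, the `_neon` bindings only appear in armv7a-neon/vp9_rtcd.h; the armv7a/ and generic/ headers in this same change keep the `_c` bindings.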
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt index 15973e2..7f331c0 100644 --- a/armv7a-neon/libvpx_srcs.txt +++ b/armv7a-neon/libvpx_srcs.txt @@ -119,7 +119,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -133,7 +132,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s @@ -205,11 +203,18 @@ vp8/vp8_cx_iface.c vp8/vp8cx.mk vp8/vp8_dx_iface.c vp8/vp8dx.mk +vp9/common/arm/neon/vp9_convolve8_avg_neon.asm.s +vp9/common/arm/neon/vp9_convolve8_neon.asm.s +vp9/common/arm/neon/vp9_convolve_neon.c +vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s +vp9/common/arm/neon/vp9_loopfilter_neon.asm.s +vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm.s vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -233,10 +238,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -264,7 +265,7 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c +vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm.s vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -273,6 +274,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h index cc3c834..6e6ff71 100644 --- a/armv7a-neon/vp9_rtcd.h +++ b/armv7a-neon/vp9_rtcd.h @@ -38,53 +38,195 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void 
vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t 
*yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); -#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c +void vp9_add_constant_residual_8x8_neon(const int16_t diff, uint8_t *dest, int stride); +#define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_neon void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, int stride); -#define vp9_add_constant_residual_16x16 vp9_add_constant_residual_16x16_c +void vp9_add_constant_residual_16x16_neon(const int16_t diff, uint8_t *dest, int stride); +#define vp9_add_constant_residual_16x16 vp9_add_constant_residual_16x16_neon void vp9_add_constant_residual_32x32_c(const int16_t diff, uint8_t *dest, int stride); -#define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_c +void vp9_add_constant_residual_32x32_neon(const int16_t diff, uint8_t *dest, int stride); +#define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_neon void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); #define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mbloop_filter_vertical_edge 
vp9_mbloop_filter_vertical_edge_c +void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_neon void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c +void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_neon -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c +void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_neon void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c +void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_neon void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_mb_inner vp9_blend_mb_inner_c @@ -95,23 +237,35 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8 vp9_convolve8_c +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_horiz vp9_convolve8_horiz_c +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_vert vp9_convolve8_vert_c +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t 
src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8 vp9_convolve8_neon -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_avg vp9_convolve8_avg_c +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_horiz vp9_convolve8_horiz_neon -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_vert vp9_convolve8_vert_neon -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_avg vp9_convolve8_avg_neon + +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_neon + +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_neon void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_1_add 
vp9_short_idct4x4_1_add_c @@ -120,7 +274,8 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct8x8_add vp9_short_idct8x8_add_c +void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_add vp9_short_idct8x8_add_neon void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c @@ -158,9 +313,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/armv7a-neon/vpx_config.h b/armv7a-neon/vpx_config.h index a808f7c..6f45f7e 100644 --- a/armv7a-neon/vpx_config.h +++ b/armv7a-neon/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/armv7a-neon/vpx_scale_rtcd.h b/armv7a-neon/vpx_scale_rtcd.h index ed84626..9972777 100644 --- a/armv7a-neon/vpx_scale_rtcd.h +++ b/armv7a-neon/vpx_scale_rtcd.h @@ -45,6 +45,9 @@ void vp8_yv12_copy_y_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" diff --git a/armv7a/libvpx_srcs.txt b/armv7a/libvpx_srcs.txt index bab4901..a929dc3 100644 --- a/armv7a/libvpx_srcs.txt +++ b/armv7a/libvpx_srcs.txt @@ -88,7 +88,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -102,7 +101,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/arm/armv5te/boolhuff_armv5te.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm.s vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm.s @@ -170,8 +168,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -195,10 +194,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -226,7 +221,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h 
-vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -235,6 +229,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h index cc3c834..d6b244d 100644 --- a/armv7a/vp9_rtcd.h +++ b/armv7a/vp9_rtcd.h @@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define 
vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, 
uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, 
const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/armv7a/vpx_config.h b/armv7a/vpx_config.h index e04f103..be08d2a 100644 --- a/armv7a/vpx_config.h +++ b/armv7a/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/armv7a/vpx_scale_rtcd.h b/armv7a/vpx_scale_rtcd.h index 3f25632..d4212f2 100644 --- a/armv7a/vpx_scale_rtcd.h +++ b/armv7a/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" diff --git a/generic/libvpx_srcs.txt b/generic/libvpx_srcs.txt index 8c1ec80..402ac24 100644 --- a/generic/libvpx_srcs.txt +++ b/generic/libvpx_srcs.txt @@ -60,7 +60,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -74,7 +73,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/bitstream.c vp8/encoder/bitstream.h vp8/encoder/block.h @@ -130,8 +128,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -155,10 +154,6 @@ vp9/common/vp9_idct.h 
vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -186,7 +181,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -195,6 +189,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h index dee08d4..c0824cb 100644 --- a/generic/vp9_rtcd.h +++ b/generic/vp9_rtcd.h @@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, 
ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + 
+void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t 
*ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, 
const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/generic/vpx_config.h b/generic/vpx_config.h index 44e6842..37dcff9 100644 --- a/generic/vpx_config.h +++ b/generic/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/generic/vpx_scale_rtcd.h b/generic/vpx_scale_rtcd.h index 3a1db05..c2842ee 100644 --- a/generic/vpx_scale_rtcd.h +++ b/generic/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" @@ -60,14 +60,10 @@ LOCAL_SRC_FILES += $(libvpx_target)/vpx_config.c # used yet but are included in the comments for future reference. 
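The vp9_rtcd.h hunks above change the convolve stride arguments from int to ptrdiff_t, the type produced by pointer subtraction, which avoids narrowing when strides are added to pointers on 64-bit targets. A minimal, self-contained sketch of that idea (not part of the patch; the buffer and function names below are hypothetical):

    #include <cstddef>   // ptrdiff_t
    #include <cstdint>   // uint8_t
    #include <vector>

    // Hypothetical strided copy: the stride is a ptrdiff_t because it is
    // added to a pointer, exactly as in the updated vp9_convolve8_* prototypes.
    static void copy_plane(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) dst[x] = src[x];
        src += src_stride;   // pointer + ptrdiff_t: no implicit narrowing
        dst += dst_stride;
      }
    }

    int main() {
      std::vector<uint8_t> src(64 * 64, 128), dst(64 * 64, 0);
      copy_plane(src.data(), 64, dst.data(), 64, 64, 64);
      return dst[0] == 128 ? 0 : 1;
    }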
libvpx_asm_offsets_intermediates := \ - vp8/common/vp8_asm_com_offsets.intermediate \ - vp8/decoder/vp8_asm_dec_offsets.intermediate \ vp8/encoder/vp8_asm_enc_offsets.intermediate \ vpx_scale/vpx_scale_asm_offsets.intermediate \ libvpx_asm_offsets_files := \ - vp8/common/vp8_asm_com_offsets.asm \ - vp8/decoder/vp8_asm_dec_offsets.asm \ vp8/encoder/vp8_asm_enc_offsets.asm \ vpx_scale/vpx_scale_asm_offsets.asm \ diff --git a/libvpx/README b/libvpx/README index 0475dad..92cc074 100644 --- a/libvpx/README +++ b/libvpx/README @@ -97,7 +97,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: 5. Configuration errors If the configuration step fails, the first step is to look in the error log. - This defaults to config.err. This should give a good indication of what went + This defaults to config.log. This should give a good indication of what went wrong. If not, contact us for support. SUPPORT diff --git a/libvpx/build/arm-msvs/obj_int_extract.bat b/libvpx/build/arm-msvs/obj_int_extract.bat index 147342d..7fd16a3 100644 --- a/libvpx/build/arm-msvs/obj_int_extract.bat +++ b/libvpx/build/arm-msvs/obj_int_extract.bat @@ -7,18 +7,7 @@ REM in the file PATENTS. All contributing project authors may REM be found in the AUTHORS file in the root of the source tree. echo on -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/common/vp9_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/decoder/vp9_asm_dec_offsets.c" -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp9/encoder/vp9_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm" - -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/common/vp8_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/decoder/vp8_asm_dec_offsets.c" cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vp8/encoder/vp8_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm" obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm" cl /I "./" /I "%1" /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%1/vpx_scale/vpx_scale_asm_offsets.c" diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index ee4493d..30a6106 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -75,7 +75,7 @@ Options: Build options: --help print this message - --log=yes|no|FILE file configure log is written to [config.err] + --log=yes|no|FILE file configure log is written to [config.log] --target=TARGET target platform tuple [generic-gnu] --cpu=CPU optimize for a specific cpu rather than a family --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS [$CFLAGS] @@ -653,6 +653,10 @@ process_common_toolchain() { tgt_isa=x86_64 tgt_os=darwin12 ;; + *darwin13*) + tgt_isa=x86_64 + tgt_os=darwin13 + ;; x86_64*mingw32*) tgt_os=win64 ;; @@ -751,6 +755,10 @@ process_common_toolchain() { add_cflags "-mmacosx-version-min=10.8" add_ldflags "-mmacosx-version-min=10.8" ;; + *-darwin13-*) + add_cflags "-mmacosx-version-min=10.9" + add_ldflags "-mmacosx-version-min=10.9" + ;; esac # Handle Solaris variants. 
Solaris 10 needs -lposix4 @@ -1296,7 +1304,7 @@ process_detect() { } enable logging -logfile="config.err" +logfile="config.log" self=$0 process() { cmdline_args="$@" diff --git a/libvpx/build/make/gen_msvs_proj.sh b/libvpx/build/make/gen_msvs_proj.sh index cff27c8..fc5011b 100755 --- a/libvpx/build/make/gen_msvs_proj.sh +++ b/libvpx/build/make/gen_msvs_proj.sh @@ -381,7 +381,7 @@ generate_vcproj() { RuntimeLibrary="$debug_runtime" \ UsePrecompiledHeader="0" \ WarningLevel="3" \ - DebugInformationFormat="1" \ + DebugInformationFormat="2" \ $warn_64bit \ $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true" @@ -395,7 +395,7 @@ generate_vcproj() { RuntimeLibrary="$debug_runtime" \ UsePrecompiledHeader="0" \ WarningLevel="3" \ - DebugInformationFormat="1" \ + DebugInformationFormat="2" \ $warn_64bit \ $uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true" diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh index 5a8c793..f9fc694 100755 --- a/libvpx/build/make/gen_msvs_sln.sh +++ b/libvpx/build/make/gen_msvs_sln.sh @@ -74,8 +74,13 @@ parse_project() { # assume that all projects have the same list of possible configurations, # so overwriting old config_lists is not a problem - config_list=`grep -A1 '<Configuration' $file | - grep Name | cut -d\" -f2` + if [ "$sfx" = "vcproj" ]; then + config_list=`grep -A1 '<Configuration' $file | + grep Name | cut -d\" -f2` + else + config_list=`grep -B1 'Label="Configuration"' $file | + grep Condition | cut -d\' -f4` + fi proj_list="${proj_list} ${var}" } @@ -168,9 +173,14 @@ process_makefile() { IFS=$'\r'$'\n' local TAB=$'\t' cat <<EOF -found_devenv := \$(shell which devenv.com >/dev/null 2>&1 && echo yes) +ifeq (\$(CONFIG_VS_VERSION),7) +MSBUILD_TOOL := devenv.com +else +MSBUILD_TOOL := msbuild.exe +endif +found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes) .nodevenv.once: -${TAB}@echo " * devenv.com not found in path." +${TAB}@echo " * \$(MSBUILD_TOOL) not found in path." ${TAB}@echo " * " ${TAB}@echo " * You will have to build all configurations manually using the" ${TAB}@echo " * Visual Studio IDE. To allow make to build them automatically," @@ -195,16 +205,17 @@ ${TAB}rm -rf "$platform"/"$config" ifneq (\$(found_devenv),) ifeq (\$(CONFIG_VS_VERSION),7) $nows_sln_config: $outfile -${TAB}devenv.com $outfile -build "$config" +${TAB}\$(MSBUILD_TOOL) $outfile -build "$config" else $nows_sln_config: $outfile -${TAB}devenv.com $outfile -build "$sln_config" +${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\ +${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform" endif else $nows_sln_config: $outfile .nodevenv.once -${TAB}@echo " * Skipping build of $sln_config (devenv.com not in path)." +${TAB}@echo " * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)." ${TAB}@echo " * " endif diff --git a/libvpx/build/make/obj_int_extract.c b/libvpx/build/make/obj_int_extract.c index 1604b5e..feed9d9 100644 --- a/libvpx/build/make/obj_int_extract.c +++ b/libvpx/build/make/obj_int_extract.c @@ -38,7 +38,21 @@ int log_msg(const char *fmt, ...) 
{ #include <mach-o/loader.h> #include <mach-o/nlist.h> -int parse_macho(uint8_t *base_buf, size_t sz) { +int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) { + switch (mode) { + case OUTPUT_FMT_RVDS: + printf("%-40s EQU %5d\n", name, val); + return 0; + case OUTPUT_FMT_GAS: + printf(".set %-40s, %5d\n", name, val); + return 0; + default: + log_msg("Unsupported mode: %d", mode); + return 1; + } +} + +int parse_macho(uint8_t *base_buf, size_t sz, output_fmt_t mode) { int i, j; struct mach_header header; uint8_t *buf = base_buf; @@ -156,8 +170,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) { memcpy(&val, base_buf + base_data_section + nl.n_value, sizeof(val)); - printf("%-40s EQU %5d\n", - str_buf + nl.n_un.n_strx + 1, val); + print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val); } else { /* if (bits == 64) */ struct nlist_64 nl; int val; @@ -167,8 +180,7 @@ int parse_macho(uint8_t *base_buf, size_t sz) { memcpy(&val, base_buf + base_data_section + nl.n_value, sizeof(val)); - printf("%-40s EQU %5d\n", - str_buf + nl.n_un.n_strx + 1, val); + print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val); } } } @@ -796,7 +808,7 @@ int main(int argc, char **argv) { #if defined(__GNUC__) && __GNUC__ #if defined(__MACH__) - res = parse_macho(file_buf, file_size); + res = parse_macho(file_buf, file_size, mode); #elif defined(__ELF__) res = parse_elf(file_buf, file_size, mode); #endif diff --git a/libvpx/build/x86-msvs/obj_int_extract.bat b/libvpx/build/x86-msvs/obj_int_extract.bat index 47fef97..4e9b0ec 100644 --- a/libvpx/build/x86-msvs/obj_int_extract.bat +++ b/libvpx/build/x86-msvs/obj_int_extract.bat @@ -7,17 +7,6 @@ REM in the file PATENTS. All contributing project authors may REM be found in the AUTHORS file in the root of the source tree. 
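For context on the obj_int_extract.c hunk above: parse_macho() now takes an output_fmt_t and routes each symbol/value pair through print_macho_equ(), so Mach-O builds can emit either RVDS-style or GAS-style assembly constants. A rough stand-alone sketch of the two output formats (illustrative only; it mirrors, but is not, the patched function, and the symbol name is made up):

    #include <cstdio>

    enum output_fmt_t { OUTPUT_FMT_RVDS, OUTPUT_FMT_GAS };

    // Prints one extracted constant in the selected assembler dialect,
    // in the same spirit as print_macho_equ() in the patch.
    static int print_equ(output_fmt_t mode, const char *name, int val) {
      switch (mode) {
        case OUTPUT_FMT_RVDS: printf("%-40s EQU %5d\n", name, val); return 0;
        case OUTPUT_FMT_GAS:  printf(".set %-40s, %5d\n", name, val); return 0;
      }
      return 1;  // unsupported mode
    }

    int main() {
      print_equ(OUTPUT_FMT_RVDS, "hypothetical_struct_offset", 24);
      print_equ(OUTPUT_FMT_GAS,  "hypothetical_struct_offset", 24);
      return 0;
    }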
echo on -cl /I "./" /I "%1" /nologo /c "%1/vp9/common/vp9_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp9/decoder/vp9_asm_dec_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp9/encoder/vp9_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp9_asm_com_offsets.obj" > "vp9_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_dec_offsets.obj" > "vp9_asm_dec_offsets.asm" -obj_int_extract.exe rvds "vp9_asm_enc_offsets.obj" > "vp9_asm_enc_offsets.asm" - -cl /I "./" /I "%1" /nologo /c "%1/vp8/common/vp8_asm_com_offsets.c" -cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/vp8_asm_dec_offsets.c" cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/vp8_asm_enc_offsets.c" -obj_int_extract.exe rvds "vp8_asm_com_offsets.obj" > "vp8_asm_com_offsets.asm" -obj_int_extract.exe rvds "vp8_asm_dec_offsets.obj" > "vp8_asm_dec_offsets.asm" obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm" diff --git a/libvpx/configure b/libvpx/configure index 28676fb..3651334 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -115,6 +115,7 @@ all_platforms="${all_platforms} x86-darwin9-icc" all_platforms="${all_platforms} x86-darwin10-gcc" all_platforms="${all_platforms} x86-darwin11-gcc" all_platforms="${all_platforms} x86-darwin12-gcc" +all_platforms="${all_platforms} x86-darwin13-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-os2-gcc" @@ -129,6 +130,7 @@ all_platforms="${all_platforms} x86_64-darwin9-gcc" all_platforms="${all_platforms} x86_64-darwin10-gcc" all_platforms="${all_platforms} x86_64-darwin11-gcc" all_platforms="${all_platforms} x86_64-darwin12-gcc" +all_platforms="${all_platforms} x86_64-darwin13-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" @@ -142,6 +144,7 @@ all_platforms="${all_platforms} universal-darwin9-gcc" all_platforms="${all_platforms} universal-darwin10-gcc" all_platforms="${all_platforms} universal-darwin11-gcc" all_platforms="${all_platforms} universal-darwin12-gcc" +all_platforms="${all_platforms} universal-darwin13-gcc" all_platforms="${all_platforms} generic-gnu" # all_targets is a list of all targets that can be configured @@ -247,7 +250,6 @@ EXPERIMENT_LIST=" multiple_arf non420 alpha - balanced_coeftree " CONFIG_LIST=" external_build @@ -682,6 +684,14 @@ process_toolchain() { # iOS/ARM builds do not work with gtest. This does not match # x86 targets. ;; + *-win*) + # Some mingw toolchains don't have pthread available by default. + # Treat these more like visual studio where threading in gtest + # would be disabled for the same reason. 
+ check_cxx "$@" <<EOF && soft_enable unit_tests +int z; +EOF + ;; *) enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests int z; diff --git a/libvpx/libs.mk b/libvpx/libs.mk index f7ed95b..4aa7dc4 100644 --- a/libvpx/libs.mk +++ b/libvpx/libs.mk @@ -202,6 +202,7 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS) libvpx_srcs.txt: @echo " [CREATE] $@" @echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@ +CLEAN-OBJS += libvpx_srcs.txt ifeq ($(CONFIG_EXTERNAL_BUILD),yes) @@ -382,6 +383,11 @@ LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +libvpx_test_srcs.txt: + @echo " [CREATE] $@" + @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@ +CLEAN-OBJS += libvpx_test_srcs.txt + $(LIBVPX_TEST_DATA): @echo " [DOWNLOAD] $@" $(qexec)trap 'rm -f $@' INT TERM &&\ @@ -442,6 +448,10 @@ else include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk GTEST_SRCS := $(addprefix third_party/googletest/src/,$(call enabled,GTEST_SRCS)) GTEST_OBJS=$(call objs,$(GTEST_SRCS)) +ifeq ($(filter win%,$(TGT_OS)),$(TGT_OS)) +# Disabling pthreads globally will cause issues on darwin and possibly elsewhere +$(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -DGTEST_HAS_PTHREAD=0 +endif $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src $(GTEST_OBJS) $(GTEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include OBJS-$(BUILD_LIBVPX) += $(GTEST_OBJS) @@ -466,7 +476,7 @@ $(foreach bin,$(LIBVPX_TEST_BINS),\ lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\ $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\ $(LIBVPX_TEST_OBJS) \ - -L. -lvpx -lgtest -lpthread -lm)\ + -L. -lvpx -lgtest $(extralibs) -lm)\ )))\ $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\ diff --git a/libvpx/test/altref_test.cc b/libvpx/test/altref_test.cc index 14af265..af25b72 100644 --- a/libvpx/test/altref_test.cc +++ b/libvpx/test/altref_test.cc @@ -33,10 +33,6 @@ class AltRefTest : public ::libvpx_test::EncoderTest, altref_count_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { if (video->frame() == 1) { diff --git a/libvpx/test/borders_test.cc b/libvpx/test/borders_test.cc index 49505ee..7bfece8 100644 --- a/libvpx/test/borders_test.cc +++ b/libvpx/test/borders_test.cc @@ -27,10 +27,6 @@ class BordersTest : public ::libvpx_test::EncoderTest, SetMode(GET_PARAM(1)); } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if ( video->frame() == 1) { diff --git a/libvpx/test/codec_factory.h b/libvpx/test/codec_factory.h index fdae572..cc7b53f 100644 --- a/libvpx/test/codec_factory.h +++ b/libvpx/test/codec_factory.h @@ -134,14 +134,14 @@ class VP8CodecFactory : public CodecFactory { const libvpx_test::VP8CodecFactory kVP8; -#define VP8_INSTANTIATE_TEST_CASE(test, params)\ +#define VP8_INSTANTIATE_TEST_CASE(test, ...)\ INSTANTIATE_TEST_CASE_P(VP8, test, \ ::testing::Combine( \ ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \ &libvpx_test::kVP8)), \ - params)) + __VA_ARGS__)) #else -#define VP8_INSTANTIATE_TEST_CASE(test, params) +#define VP8_INSTANTIATE_TEST_CASE(test, ...) 
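The codec_factory.h hunk above (and its VP9 counterpart just below) turns the INSTANTIATE_TEST_CASE wrappers into variadic macros, so a test can combine the codec factory with more than one gtest parameter generator, which the new CpuSpeedTest relies on. A small sketch of the forwarding pattern, using a made-up macro and function rather than the gtest machinery:

    #include <cstdio>

    // Made-up example: a variadic macro forwards any number of extra
    // arguments, where the old single-parameter form could forward only one.
    #define COMBINE_WITH_FACTORY(factory, ...) combine(factory, __VA_ARGS__)

    static int combine(int factory, int a) { return factory + a; }
    static int combine(int factory, int a, int b) { return factory + a + b; }

    int main() {
      // One extra argument (old behaviour) and two (what CpuSpeedTest needs).
      printf("%d %d\n", COMBINE_WITH_FACTORY(100, 1),
             COMBINE_WITH_FACTORY(100, 1, 2));
      return 0;
    }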
#endif // CONFIG_VP8 @@ -216,14 +216,14 @@ class VP9CodecFactory : public CodecFactory { const libvpx_test::VP9CodecFactory kVP9; -#define VP9_INSTANTIATE_TEST_CASE(test, params)\ +#define VP9_INSTANTIATE_TEST_CASE(test, ...)\ INSTANTIATE_TEST_CASE_P(VP9, test, \ ::testing::Combine( \ ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \ &libvpx_test::kVP9)), \ - params)) + __VA_ARGS__)) #else -#define VP9_INSTANTIATE_TEST_CASE(test, params) +#define VP9_INSTANTIATE_TEST_CASE(test, ...) #endif // CONFIG_VP9 diff --git a/libvpx/test/config_test.cc b/libvpx/test/config_test.cc index 9008728..36c6330 100644 --- a/libvpx/test/config_test.cc +++ b/libvpx/test/config_test.cc @@ -40,10 +40,6 @@ class ConfigTest : public ::libvpx_test::EncoderTest, ++frame_count_out_; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - unsigned int frame_count_in_; unsigned int frame_count_out_; unsigned int frame_count_max_; diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index fd2bd36..3b72129 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -22,8 +22,8 @@ extern "C" { } namespace { -typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h); @@ -211,7 +211,7 @@ class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) { virtual void SetUp() { UUT_ = GET_PARAM(2); - /* Set up guard blocks for an inner block cetered in the outer block */ + /* Set up guard blocks for an inner block centered in the outer block */ for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) output_[i] = 255; @@ -546,4 +546,26 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( make_tuple(32, 64, &convolve8_ssse3), make_tuple(64, 64, &convolve8_ssse3))); #endif + +#if HAVE_NEON +const ConvolveFunctions convolve8_neon( + vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon, + vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon, + vp9_convolve8_neon, vp9_convolve8_avg_neon); + +INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values( + make_tuple(4, 4, &convolve8_neon), + make_tuple(8, 4, &convolve8_neon), + make_tuple(4, 8, &convolve8_neon), + make_tuple(8, 8, &convolve8_neon), + make_tuple(16, 8, &convolve8_neon), + make_tuple(8, 16, &convolve8_neon), + make_tuple(16, 16, &convolve8_neon), + make_tuple(32, 16, &convolve8_neon), + make_tuple(16, 32, &convolve8_neon), + make_tuple(32, 32, &convolve8_neon), + make_tuple(64, 32, &convolve8_neon), + make_tuple(32, 64, &convolve8_neon), + make_tuple(64, 64, &convolve8_neon))); +#endif } // namespace diff --git a/libvpx/test/cpu_speed_test.cc b/libvpx/test/cpu_speed_test.cc new file mode 100644 index 0000000..e6ad75b --- /dev/null +++ b/libvpx/test/cpu_speed_test.cc @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <climits> +#include <vector> +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class CpuSpeedTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params< + libvpx_test::TestMode, int> { + protected: + CpuSpeedTest() : EncoderTest(GET_PARAM(0)) {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); + encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); + encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); + } + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { + } + } + int set_cpu_used_; +}; + +TEST_P(CpuSpeedTest, TestQ0) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 400; + cfg_.rc_max_quantizer = 0; + cfg_.rc_min_quantizer = 0; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 20); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + + +TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { + // Validate that this non multiple of 64 wide clip encodes and decodes + // without a mismatch when passing in a very low max q. This pushes + // the encoder to producing lots of big partitions which will likely + // extend into the border and test the border condition. + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 12000; + cfg_.rc_max_quantizer = 10; + cfg_.rc_min_quantizer = 0; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +TEST_P(CpuSpeedTest, TestLowBitrate) { + // Validate that this clip encodes and decodes without a mismatch + // when passing in a very high min q. This pushes the encoder to producing + // lots of small partitions which might will test the other condition. 
+ + cfg_.g_lag_in_frames = 25; + cfg_.rc_2pass_vbr_minsection_pct = 5; + cfg_.rc_2pass_vbr_minsection_pct = 2000; + cfg_.rc_target_bitrate = 200; + cfg_.rc_min_quantizer = 40; + + ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, + 40); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +using std::tr1::make_tuple; + +#define VP9_FACTORY \ + static_cast<const libvpx_test::CodecFactory*> (&libvpx_test::kVP9) + +VP9_INSTANTIATE_TEST_CASE( + CpuSpeedTest, + ::testing::Values(::libvpx_test::kTwoPassGood), + ::testing::Range(0, 3)); +} // namespace diff --git a/libvpx/test/cq_test.cc b/libvpx/test/cq_test.cc index a6a4b8e..a2c8291 100644 --- a/libvpx/test/cq_test.cc +++ b/libvpx/test/cq_test.cc @@ -42,10 +42,6 @@ class CQTest : public ::libvpx_test::EncoderTest, n_frames_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { if (video->frame() == 1) { diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc index 85eeafb..287e805 100644 --- a/libvpx/test/datarate_test.cc +++ b/libvpx/test/datarate_test.cc @@ -36,10 +36,6 @@ class DatarateTest : public ::libvpx_test::EncoderTest, duration_ = 0.0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { const vpx_rational_t tb = video->timebase(); diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 9fb45d6..0795054 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -13,6 +13,7 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx_ports/mem.h" extern "C" { #include "vp9/common/vp9_entropy.h" @@ -264,59 +265,79 @@ void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) { } } +void fdct16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int /*tx_type*/) { + vp9_short_fdct16x16_c(in, out, stride); +} +void idct16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int /*tx_type*/) { + vp9_short_idct16x16_add_c(out, dst, stride >> 1); +} +void fht16x16(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int tx_type) { + // FIXME(jingning): need to test both SSE2 and c +#if HAVE_SSE2 + vp9_short_fht16x16_sse2(in, out, stride >> 1, tx_type); +#else + vp9_short_fht16x16_c(in, out, stride >> 1, tx_type); +#endif +} +void iht16x16_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + vp9_short_iht16x16_add_c(out, dst, stride >> 1, tx_type); +} -TEST(VP9Idct16x16Test, AccuracyCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - for (int i = 0; i < count_test_block; ++i) { - int16_t in[256], coeff[256]; - uint8_t dst[256], src[256]; - double out_r[256]; - - for (int j = 0; j < 256; ++j) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); +class FwdTrans16x16Test : public ::testing::TestWithParam<int> { + public: + virtual ~FwdTrans16x16Test() {} + + virtual void SetUp() { + tx_type_ = GetParam(); + if (tx_type_ == 0) { + fwd_txfm = fdct16x16; + inv_txfm = idct16x16_add; + } else { + fwd_txfm = fht16x16; + inv_txfm = iht16x16_add; } - // Initialize a test block with input range [-255, 255]. 
- for (int j = 0; j < 256; ++j) - in[j] = src[j] - dst[j]; + } - reference_16x16_dct_2d(in, out_r); - for (int j = 0; j < 256; j++) - coeff[j] = round(out_r[j]); - vp9_short_idct16x16_add_c(coeff, dst, 16); - for (int j = 0; j < 256; ++j) { - const int diff = dst[j] - src[j]; - const int error = diff * diff; - EXPECT_GE(1, error) - << "Error: 16x16 IDCT has error " << error - << " at index " << j; - } + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*fwd_txfm)(in, out, dst, stride, tx_type); } -} + void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*inv_txfm)(in, out, dst, stride, tx_type); + } + + int tx_type_; + void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int); + void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int); +}; -// we need enable fdct test once we re-do the 16 point fdct. -TEST(VP9Fdct16x16Test, AccuracyCheck) { +TEST_P(FwdTrans16x16Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; double total_error = 0; - const int count_test_block = 1000; + const int count_test_block = 10000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[256]; - int16_t test_temp_block[256]; - uint8_t dst[256], src[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 256); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 256); for (int j = 0; j < 256; ++j) { src[j] = rnd.Rand8(); dst[j] = rnd.Rand8(); - } - // Initialize a test block with input range [-255, 255]. - for (int j = 0; j < 256; ++j) + // Initialize a test block with input range [-255, 255]. test_input_block[j] = src[j] - dst[j]; + } const int pitch = 32; - vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch); - vp9_short_idct16x16_add_c(test_temp_block, dst, 16); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 256; ++j) { const int diff = dst[j] - src[j]; @@ -328,18 +349,21 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) { } EXPECT_GE(1, max_error) - << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1"; + << "Error: 16x16 FHT/IHT has an individual round trip error > 1"; EXPECT_GE(count_test_block , total_error) - << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block"; + << "Error: 16x16 FHT/IHT has average round trip error > 1 per block"; } -TEST(VP9Fdct16x16Test, CoeffSizeCheck) { +TEST_P(FwdTrans16x16Test, CoeffSizeCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; for (int i = 0; i < count_test_block; ++i) { - int16_t input_block[256], input_extreme_block[256]; - int16_t output_block[256], output_extreme_block[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, 256); + DECLARE_ALIGNED_ARRAY(16, int16_t, output_extreme_block, 256); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 256); // Initialize a test block with input range [-255, 255]. 
for (int j = 0; j < 256; ++j) { @@ -351,16 +375,50 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) { input_extreme_block[j] = 255; const int pitch = 32; - vp9_short_fdct16x16_c(input_block, output_block, pitch); - vp9_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch); + RunFwdTxfm(input_block, output_block, dst, pitch, tx_type_); + RunFwdTxfm(input_extreme_block, output_extreme_block, dst, pitch, tx_type_); // The minimum quant value is 4. for (int j = 0; j < 256; ++j) { - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j])) + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j])) << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j])) - << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE"; + EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_extreme_block[j])) + << "Error: 16x16 FDCT extreme has coefficient larger " + << "than 4*DCT_MAX_VALUE"; + } + } +} + +INSTANTIATE_TEST_CASE_P(VP9, FwdTrans16x16Test, ::testing::Range(0, 4)); + +TEST(VP9Idct16x16Test, AccuracyCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + for (int i = 0; i < count_test_block; ++i) { + int16_t in[256], coeff[256]; + uint8_t dst[256], src[256]; + double out_r[256]; + + for (int j = 0; j < 256; ++j) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + } + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < 256; ++j) + in[j] = src[j] - dst[j]; + + reference_16x16_dct_2d(in, out_r); + for (int j = 0; j < 256; j++) + coeff[j] = round(out_r[j]); + vp9_short_idct16x16_add_c(coeff, dst, 16); + for (int j = 0; j < 256; ++j) { + const int diff = dst[j] - src[j]; + const int error = diff * diff; + EXPECT_GE(1, error) + << "Error: 16x16 IDCT has error " << error + << " at index " << j; } } } + } // namespace diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h index 6aeb96b..dbdc33c 100644 --- a/libvpx/test/encode_test_driver.h +++ b/libvpx/test/encode_test_driver.h @@ -190,7 +190,9 @@ class EncoderTest { virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {} // Hook to determine whether the encode loop should continue. 
- virtual bool Continue() const { return !abort_; } + virtual bool Continue() const { + return !(::testing::Test::HasFatalFailure() || abort_); + } const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc index ddfbd0f..d4a6967 100644 --- a/libvpx/test/error_resilience_test.cc +++ b/libvpx/test/error_resilience_test.cc @@ -50,10 +50,6 @@ class ErrorResilienceTest : public ::libvpx_test::EncoderTest, mismatch_nframes_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc index 1c887bb..9dcc078 100644 --- a/libvpx/test/fdct4x4_test.cc +++ b/libvpx/test/fdct4x4_test.cc @@ -20,29 +20,75 @@ extern "C" { #include "acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" using libvpx_test::ACMRandom; namespace { +void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int /*tx_type*/) { + vp9_short_fdct4x4_c(in, out, stride); +} +void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int /*tx_type*/) { + vp9_short_idct4x4_add_c(out, dst, stride >> 1); +} +void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int tx_type) { + vp9_short_fht4x4_c(in, out, stride >> 1, tx_type); +} +void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + vp9_short_iht4x4_add_c(out, dst, stride >> 1, tx_type); +} + +class FwdTrans4x4Test : public ::testing::TestWithParam<int> { + public: + virtual ~FwdTrans4x4Test() {} + virtual void SetUp() { + tx_type_ = GetParam(); + if (tx_type_ == 0) { + fwd_txfm_ = fdct4x4; + inv_txfm_ = idct4x4_add; + } else { + fwd_txfm_ = fht4x4; + inv_txfm_ = iht4x4_add; + } + } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*fwd_txfm_)(in, out, dst, stride, tx_type); + } + + void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*inv_txfm_)(in, out, dst, stride, tx_type); + } + + int tx_type_; + void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type); + void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type); +}; -TEST(Vp9Fdct4x4Test, SignBiasCheck) { +TEST_P(FwdTrans4x4Test, SignBiasCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - int16_t test_input_block[16]; - int16_t test_output_block[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16); const int pitch = 8; int count_sign_block[16][2]; const int count_test_block = 1000000; memset(count_sign_block, 0, sizeof(count_sign_block)); - for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-255, 255]. for (int j = 0; j < 16; ++j) test_input_block[j] = rnd.Rand8() - rnd.Rand8(); - // TODO(Yaowu): this should be converted to a parameterized test - // to test optimized versions of this function. 
- vp9_short_fdct4x4_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) @@ -56,20 +102,18 @@ TEST(Vp9Fdct4x4Test, SignBiasCheck) { const bool bias_acceptable = (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000); EXPECT_TRUE(bias_acceptable) - << "Error: 4x4 FDCT has a sign bias > 1%" - << " for input range [-255, 255] at index " << j; + << "Error: 4x4 FDCT/FHT has a sign bias > 1%" + << " for input range [-255, 255] at index " << j + << " tx_type " << tx_type_; } memset(count_sign_block, 0, sizeof(count_sign_block)); - for (int i = 0; i < count_test_block; ++i) { // Initialize a test block with input range [-15, 15]. for (int j = 0; j < 16; ++j) test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); - // TODO(Yaowu): this should be converted to a parameterized test - // to test optimized versions of this function. - vp9_short_fdct4x4_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 16; ++j) { if (test_output_block[j] < 0) @@ -83,20 +127,22 @@ TEST(Vp9Fdct4x4Test, SignBiasCheck) { const bool bias_acceptable = (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000); EXPECT_TRUE(bias_acceptable) - << "Error: 4x4 FDCT has a sign bias > 10%" + << "Error: 4x4 FDCT/FHT has a sign bias > 10%" << " for input range [-15, 15] at index " << j; } -}; +} -TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { +TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); + int max_error = 0; double total_error = 0; const int count_test_block = 1000000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[16]; - int16_t test_temp_block[16]; - uint8_t dst[16], src[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16); for (int j = 0; j < 16; ++j) { src[j] = rnd.Rand8(); @@ -106,10 +152,8 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { for (int j = 0; j < 16; ++j) test_input_block[j] = src[j] - dst[j]; - // TODO(Yaowu): this should be converted to a parameterized test - // to test optimized versions of this function. const int pitch = 8; - vp9_short_fdct4x4_c(test_input_block, test_temp_block, pitch); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 16; ++j) { if(test_temp_block[j] > 0) { @@ -123,8 +167,8 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { } } - // Because the bitstream is not frozen yet, use the idct in the codebase. 
- vp9_short_idct4x4_add_c(test_temp_block, dst, 4); + // inverse transform and reconstruct the pixel block + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 16; ++j) { const int diff = dst[j] - src[j]; @@ -135,10 +179,12 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) { } } EXPECT_GE(1, max_error) - << "Error: FDCT/IDCT has an individual roundtrip error > 1"; + << "Error: FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1"; EXPECT_GE(count_test_block, total_error) - << "Error: FDCT/IDCT has average roundtrip error > 1 per block"; -}; + << "Error: FDCT/IDCT or FHT/IHT has average " + << "roundtrip error > 1 per block"; +} +INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4)); } // namespace diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc index 90b4ecd..50e2e9d 100644 --- a/libvpx/test/fdct8x8_test.cc +++ b/libvpx/test/fdct8x8_test.cc @@ -13,6 +13,7 @@ #include <string.h> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "vpx_ports/mem.h" extern "C" { #include "vp9_rtcd.h" @@ -25,11 +26,62 @@ void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch); using libvpx_test::ACMRandom; namespace { +void fdct8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int /*tx_type*/) { + vp9_short_fdct8x8_c(in, out, stride); +} +void idct8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int /*tx_type*/) { + vp9_short_idct8x8_add_c(out, dst, stride >> 1); +} +void fht8x8(int16_t *in, int16_t *out, uint8_t* /*dst*/, + int stride, int tx_type) { + // TODO(jingning): need to refactor this to test both _c and _sse2 functions, + // when we have all inverse dct functions done sse2. +#if HAVE_SSE2 + vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type); +#else + vp9_short_fht8x8_c(in, out, stride >> 1, tx_type); +#endif +} +void iht8x8_add(int16_t* /*in*/, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + vp9_short_iht8x8_add_c(out, dst, stride >> 1, tx_type); +} + +class FwdTrans8x8Test : public ::testing::TestWithParam<int> { + public: + virtual ~FwdTrans8x8Test() {} + virtual void SetUp() { + tx_type_ = GetParam(); + if (tx_type_ == 0) { + fwd_txfm = fdct8x8; + inv_txfm = idct8x8_add; + } else { + fwd_txfm = fht8x8; + inv_txfm = iht8x8_add; + } + } + + protected: + void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*fwd_txfm)(in, out, dst, stride, tx_type); + } + void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst, + int stride, int tx_type) { + (*inv_txfm)(in, out, dst, stride, tx_type); + } + + int tx_type_; + void (*fwd_txfm)(int16_t*, int16_t*, uint8_t*, int, int); + void (*inv_txfm)(int16_t*, int16_t*, uint8_t*, int, int); +}; -TEST(VP9Fdct8x8Test, SignBiasCheck) { +TEST_P(FwdTrans8x8Test, SignBiasCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - int16_t test_input_block[64]; - int16_t test_output_block[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64); const int pitch = 16; int count_sign_block[64][2]; const int count_test_block = 100000; @@ -41,7 +93,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { for (int j = 0; j < 64; ++j) test_input_block[j] = rnd.Rand8() - rnd.Rand8(); - vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -55,7 +107,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { const 
int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = 1125; EXPECT_LT(diff, max_diff) - << "Error: 8x8 FDCT has a sign bias > " + << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j << " count0: " << count_sign_block[j][0] @@ -70,7 +122,7 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { for (int j = 0; j < 64; ++j) test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4); - vp9_short_fdct8x8_c(test_input_block, test_output_block, pitch); + RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_); for (int j = 0; j < 64; ++j) { if (test_output_block[j] < 0) @@ -84,24 +136,25 @@ TEST(VP9Fdct8x8Test, SignBiasCheck) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = 10000; EXPECT_LT(diff, max_diff) - << "Error: 4x4 FDCT has a sign bias > " + << "Error: 4x4 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j << " count0: " << count_sign_block[j][0] << " count1: " << count_sign_block[j][1] << " diff: " << diff; } -}; +} -TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { +TEST_P(FwdTrans8x8Test, RoundTripErrorCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; double total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[64]; - int16_t test_temp_block[64]; - uint8_t dst[64], src[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64); for (int j = 0; j < 64; ++j) { src[j] = rnd.Rand8(); @@ -112,7 +165,7 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 64; ++j){ if(test_temp_block[j] > 0) { test_temp_block[j] += 2; @@ -124,7 +177,7 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { test_temp_block[j] *= 4; } } - vp9_short_idct8x8_add_c(test_temp_block, dst, 8); + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; @@ -136,21 +189,23 @@ TEST(VP9Fdct8x8Test, RoundTripErrorCheck) { } EXPECT_GE(1, max_error) - << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1"; + << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1"; EXPECT_GE(count_test_block/5, total_error) - << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block"; -}; + << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " + "error > 1/5 per block"; +} -TEST(VP9Fdct8x8Test, ExtremalCheck) { +TEST_P(FwdTrans8x8Test, ExtremalCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); int max_error = 0; double total_error = 0; const int count_test_block = 100000; for (int i = 0; i < count_test_block; ++i) { - int16_t test_input_block[64]; - int16_t test_temp_block[64]; - uint8_t dst[64], src[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64); + DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64); for (int j = 0; j < 64; ++j) { src[j] = rnd.Rand8() % 2 ? 
255 : 0; @@ -161,8 +216,8 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) { test_input_block[j] = src[j] - dst[j]; const int pitch = 16; - vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch); - vp9_short_idct8x8_add_c(test_temp_block, dst, 8); + RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); + RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_); for (int j = 0; j < 64; ++j) { const int diff = dst[j] - src[j]; @@ -173,13 +228,14 @@ TEST(VP9Fdct8x8Test, ExtremalCheck) { } EXPECT_GE(1, max_error) - << "Error: Extremal 8x8 FDCT/IDCT has an" + << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has an" << " individual roundtrip error > 1"; EXPECT_GE(count_test_block/5, total_error) - << "Error: Extremal 8x8 FDCT/IDCT has average" + << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; } -}; +} +INSTANTIATE_TEST_CASE_P(VP9, FwdTrans8x8Test, ::testing::Range(0, 4)); } // namespace diff --git a/libvpx/test/i420_video_source.h b/libvpx/test/i420_video_source.h index 12a6ab1..bcbe8a7 100644 --- a/libvpx/test/i420_video_source.h +++ b/libvpx/test/i420_video_source.h @@ -49,7 +49,7 @@ class I420VideoSource : public VideoSource { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, raw_sz_ * start_, SEEK_SET); @@ -92,6 +92,7 @@ class I420VideoSource : public VideoSource { } virtual void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); // Read a frame from input_file. if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) { limit_ = frame_; @@ -108,8 +109,8 @@ class I420VideoSource : public VideoSource { unsigned int frame_; unsigned int width_; unsigned int height_; - unsigned int framerate_numerator_; - unsigned int framerate_denominator_; + int framerate_numerator_; + int framerate_denominator_; }; } // namespace libvpx_test diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc index 659cce0..aa786cb 100644 --- a/libvpx/test/idct_test.cc +++ b/libvpx/test/idct_test.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - extern "C" { #include "./vpx_config.h" #include "./vp8_rtcd.h" @@ -22,100 +21,94 @@ typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr, int dst_stride); namespace { class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { - protected: - virtual void SetUp() { - int i; - - UUT = GetParam(); - memset(input, 0, sizeof(input)); - /* Set up guard blocks */ - for (i = 0; i < 256; i++) - output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1; - } - - virtual void TearDown() { - libvpx_test::ClearSystemState(); - } - - idct_fn_t UUT; - short input[16]; - unsigned char output[256]; - unsigned char predict[256]; + protected: + virtual void SetUp() { + int i; + + UUT = GetParam(); + memset(input, 0, sizeof(input)); + /* Set up guard blocks */ + for (i = 0; i < 256; i++) output[i] = ((i & 0xF) < 4 && (i < 64)) ? 
0 : -1; + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + idct_fn_t UUT; + short input[16]; + unsigned char output[256]; + unsigned char predict[256]; }; TEST_P(IDCTTest, TestGuardBlocks) { - int i; + int i; - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << i; - else - EXPECT_EQ(255, output[i]); + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(0, output[i]) << i; + else + EXPECT_EQ(255, output[i]); } TEST_P(IDCTTest, TestAllZeros) { - int i; + int i; - REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(0, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(0, output[i]) << "i==" << i; + else + EXPECT_EQ(255, output[i]) << "i==" << i; } TEST_P(IDCTTest, TestAllOnes) { - int i; + int i; - input[0] = 4; - REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + input[0] = 4; + REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(1, output[i]) << "i==" << i; + else + EXPECT_EQ(255, output[i]) << "i==" << i; } TEST_P(IDCTTest, TestAddOne) { - int i; + int i; - for (i = 0; i < 256; i++) - predict[i] = i; - input[0] = 4; - REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); + for (i = 0; i < 256; i++) predict[i] = i; + input[0] = 4; + REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16)); - for (i = 0; i < 256; i++) - if ((i & 0xF) < 4 && i < 64) - EXPECT_EQ(i+1, output[i]) << "i==" << i; - else - EXPECT_EQ(255, output[i]) << "i==" << i; + for (i = 0; i < 256; i++) + if ((i & 0xF) < 4 && i < 64) + EXPECT_EQ(i + 1, output[i]) << "i==" << i; + else + EXPECT_EQ(255, output[i]) << "i==" << i; } TEST_P(IDCTTest, TestWithData) { - int i; - - for (i = 0; i < 16; i++) - input[i] = i; - - REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); - - for (i = 0; i < 256; i++) - if ((i & 0xF) > 3 || i > 63) - EXPECT_EQ(255, output[i]) << "i==" << i; - else if (i == 0) - EXPECT_EQ(11, output[i]) << "i==" << i; - else if (i == 34) - EXPECT_EQ(1, output[i]) << "i==" << i; - else if (i == 2 || i == 17 || i == 32) - EXPECT_EQ(3, output[i]) << "i==" << i; - else - EXPECT_EQ(0, output[i]) << "i==" << i; + int i; + + for (i = 0; i < 16; i++) input[i] = i; + + REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16)); + + for (i = 0; i < 256; i++) + if ((i & 0xF) > 3 || i > 63) + EXPECT_EQ(255, output[i]) << "i==" << i; + else if (i == 0) + EXPECT_EQ(11, output[i]) << "i==" << i; + else if (i == 34) + EXPECT_EQ(1, output[i]) << "i==" << i; + else if (i == 2 || i == 17 || i == 32) + EXPECT_EQ(3, output[i]) << "i==" << i; + else + EXPECT_EQ(0, output[i]) << "i==" << i; } -INSTANTIATE_TEST_CASE_P(C, IDCTTest, - ::testing::Values(vp8_short_idct4x4llm_c)); +INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c)); #if HAVE_MMX INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmx)); diff --git a/libvpx/test/intrapred_test.cc b/libvpx/test/intrapred_test.cc index 39ec896..da96741 100644 --- a/libvpx/test/intrapred_test.cc +++ b/libvpx/test/intrapred_test.cc @@ -27,6 +27,8 @@ using libvpx_test::ACMRandom; class IntraPredBase { 
public: + virtual ~IntraPredBase() {} + virtual void TearDown() { libvpx_test::ClearSystemState(); } diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h index 48c3a7d..926f801 100644 --- a/libvpx/test/ivf_video_source.h +++ b/libvpx/test/ivf_video_source.h @@ -47,12 +47,13 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Init() { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; - ASSERT_TRUE(compressed_frame_buf_) << "Allocate frame buffer failed"; + ASSERT_TRUE(compressed_frame_buf_ != NULL) + << "Allocate frame buffer failed"; } virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " << file_name_; // Read file header @@ -72,6 +73,7 @@ class IVFVideoSource : public CompressedVideoSource { } void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); uint8_t frame_hdr[kIvfFrameHdrSize]; // Check frame header and read a frame from input_file. if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc index 85ca0b9..f7572e8 100644 --- a/libvpx/test/keyframe_test.cc +++ b/libvpx/test/keyframe_test.cc @@ -31,10 +31,6 @@ class KeyframeTest : public ::libvpx_test::EncoderTest, set_cpu_used_ = 0; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (kf_do_force_kf_) diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index 0d591ad..7412a24 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -70,10 +70,6 @@ class ResizeTest : public ::libvpx_test::EncoderTest, SetMode(GET_PARAM(1)); } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void DecompressedFrameHook(const vpx_image_t &img, vpx_codec_pts_t pts) { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index 1f5435f..bf3e0b8 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -452,10 +452,14 @@ const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; #endif #if CONFIG_VP9_ENCODER const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; +const sad_m_by_n_fn_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; +const sad_m_by_n_fn_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; +const sad_m_by_n_fn_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2; +const sad_m_by_n_fn_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2; const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; -const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; +const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; #endif @@ -469,10 +473,14 @@ const sad_m_by_n_test_param_t sse2_tests[] = { #endif #if CONFIG_VP9_ENCODER make_tuple(64, 64, sad_64x64_sse2_vp9), + make_tuple(64, 32, sad_64x32_sse2_vp9), + make_tuple(32, 64, sad_32x64_sse2_vp9), make_tuple(32, 32, sad_32x32_sse2_vp9), + make_tuple(32, 16, sad_32x16_sse2_vp9), + make_tuple(16, 32, sad_16x32_sse2_vp9), make_tuple(16, 16, sad_16x16_sse2_vp9), - make_tuple(8, 16, 
sad_8x16_sse2_vp9), make_tuple(16, 8, sad_16x8_sse2_vp9), + make_tuple(8, 16, sad_8x16_sse2_vp9), make_tuple(8, 8, sad_8x8_sse2_vp9), make_tuple(8, 4, sad_8x4_sse2_vp9), #endif diff --git a/libvpx/test/subtract_test.cc b/libvpx/test/subtract_test.cc index 81bfb66..574bfbf 100644 --- a/libvpx/test/subtract_test.cc +++ b/libvpx/test/subtract_test.cc @@ -61,7 +61,7 @@ TEST_P(SubtractBlockTest, SimpleSubtract) { int16_t *src_diff = be.src_diff; for (int r = 0; r < kBlockHeight; ++r) { for (int c = 0; c < kBlockWidth; ++c) { - src_diff[c] = 0xa5a5; + src_diff[c] = static_cast<int16_t>(0xa5a5); } src_diff += kDiffPredStride; } diff --git a/libvpx/test/superframe_test.cc b/libvpx/test/superframe_test.cc index 062ec6c..d91e7b1 100644 --- a/libvpx/test/superframe_test.cc +++ b/libvpx/test/superframe_test.cc @@ -33,10 +33,6 @@ class SuperframeTest : public ::libvpx_test::EncoderTest, delete[] modified_buf_; } - virtual bool Continue() const { - return !HasFatalFailure() && !abort_; - } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { if (video->frame() == 1) { diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1 index 1036d7c..0ac4905 100644 --- a/libvpx/test/test-data.sha1 +++ b/libvpx/test/test-data.sha1 @@ -122,223 +122,401 @@ f95eb6214571434f1f73ab7833b9ccdf47588020 vp80-03-segmentation-1437.ivf.md5 086c56378df81b6cee264d7540a7b8f2b405c7a4 vp80-05-sharpness-1439.ivf.md5 d32dc2c4165eb266ea4c23c14a45459b363def32 vp80-05-sharpness-1440.ivf.md5 8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5 -c5b6fc822d7b4ed97b5a0d69e3a71d9de6cab815 vp90-00-akiyo-100.webm -1cd8ee73b53f4ecc2511effd233f9af6ecdfac7e vp90-00-akiyo-100.webm.md5 -a854b0f2313efde7767a4465afbcbe35005ffb07 vp90-00-akiyo-200.webm -b0f53ad309611246821174b642f6808cc1e670de vp90-00-akiyo-200.webm.md5 -38a5c0e5465f884474b1a5a9184685f17f961ba1 vp90-00-akiyo-300.webm -756a34417fc10dc2a49464eccaa6b7f987227b57 vp90-00-akiyo-300.webm.md5 -1047e6f19dd137ae7bbd5b93d407fc7186f8a98e vp90-00-akiyo-50.webm -0fa08a76901a6a5b2d4b58a6b20bfa5239409b9d vp90-00-akiyo-50.webm.md5 -767511b25dde2c5926f5284782a9f1e04fe7afda vp90-00-bowing-150.webm -b259c3c6afb30fd1ae7d3a563c1fe9fe6a4644cd vp90-00-bowing-150.webm.md5 -2ef831c75c021a03176536fb652196e9afc37888 vp90-00-bowing-25.webm -37d3522cd76b7bab3b5e973e2b2c51edea49ef3f vp90-00-bowing-25.webm.md5 -c1e4639f14914516ca704f38c875d01f4c06be14 vp90-00-bowing-400.webm -ca35c574512185d5f20f3b81517d6ac3333a1377 vp90-00-bowing-400.webm.md5 -e20fc293db095e52f29b891bc09458e7568e8603 vp90-00-bus-100.webm -a754ea588cc409546936c09fb1ad06b3014b94f9 vp90-00-bus-100.webm.md5 -da5eb45fa42f55ff70ec7b71999e6fd8489d12f9 vp90-00-bus-2000.webm -2a7356328eb991175cbddebd51a30018e48632f2 vp90-00-bus-2000.webm.md5 -607169c774664176aca7c7d46dabf04b9c3634e4 vp90-00-bus-300.webm -c84daa3a0290d73226b243dd630820ac97bf4fbd vp90-00-bus-300.webm.md5 -655902b54b9a8a882c11bc8bce1447f3b2085035 vp90-00-bus-4400.webm -f719ecd7b53c8e35fae735396629d1915ffc1ff9 vp90-00-bus-4400.webm.md5 -afcdca9763d233dd63fd67165a7b92ea679822af vp90-00-bus-800.webm -66e2a55560e570cae09520060f1ae315c7ea0a07 vp90-00-bus-800.webm.md5 -390b91c8566d94c3a869af77531585c38f9f78da vp90-00-cheer-1600.webm -3d47da26375a75afef0cf2123f5c808d0862e25d vp90-00-cheer-1600.webm.md5 -23419784db17a50e129e3bd030c20256cf0d6eb0 vp90-00-cheer-2800.webm -0df4676171f19e7807d719a9b8a6fadcefc8f1fc vp90-00-cheer-2800.webm.md5 -45ed3c42874d5ec88852798691cf54bfb0cf652a vp90-00-cheer-400.webm 
-374fd67ac9ae0e8146051b77963459c54b9eaaa2 vp90-00-cheer-400.webm.md5 -1c9459d824116a297ff0e90bed9be783005f9ac1 vp90-00-cheer-600.webm -9dc0d43f72c8eb49d51a9748fb9948495529a6b5 vp90-00-cheer-600.webm.md5 -a86c5af1929d2f929a5caf6ef847d0066086223b vp90-00-city-1200.webm -231c7f0f406e3a8d2328daee4c4466e1b4d47354 vp90-00-city-1200.webm.md5 -be9cf927e6ab517d7876925d21b3193b1373d03d vp90-00-city-2000.webm -487d60226a3a3039528a049e9c6e8243b07404e6 vp90-00-city-2000.webm.md5 -1f3cd649d5829d52c08da3323baa86b1dcf2d2de vp90-00-city-300.webm -8e3b38cfa2be757e46ea12cff11762cb50134615 vp90-00-city-300.webm.md5 -286f6ea64c33ce735b5b7806aac4ca5ee331af66 vp90-00-city-600.webm -7c51ead147ef4029094a2b455239090c1999d8fe vp90-00-city-600.webm.md5 -f7ecbd63bed06ed15afe0ba2a192f2cf7943714c vp90-00-coastguard-1200.webm -8c8fed2c64cc8fb330e9200e1e0f58a79b953b79 vp90-00-coastguard-1200.webm.md5 -2e63178e5b2c2cc84226df2b514c4dde46c32d70 vp90-00-coastguard-200.webm -128f2b22fdcfd02bc50e63b1cd6d40c0cc4998d6 vp90-00-coastguard-200.webm.md5 -97b779617d3c1ca8f50beda7126be5df913d071d vp90-00-coastguard-3600.webm -0da0ab4794439e6b8ab9ced41239e1307686be69 vp90-00-coastguard-3600.webm.md5 -5e060d66573a40f7f0a46ae9b6acb51b0afb2e3c vp90-00-coastguard-5200.webm -4ba526d4bb895c4794dc20edeb38b102a9b1bd92 vp90-00-coastguard-5200.webm.md5 -17810fa737f29d5b032836e38243bbb666f06636 vp90-00-container-1000.webm -7e0fd7e93c5a16394818f844aa5f2d5fa7a73ee2 vp90-00-container-1000.webm.md5 -38deb4f59cec9e62715dec2f3670ffe7b1cf493e vp90-00-container-200.webm -aa3229017f920750bd5d919e19ea6127ea05adc0 vp90-00-container-200.webm.md5 -8b1a67ef35d3f00981d23c41b56a0a2e09976312 vp90-00-container-50.webm -0a6f1a793b936ff1287326882f1165065a2dcea0 vp90-00-container-50.webm.md5 -4c724db691b7202b60b56107ec7b0abc6cc52bdc vp90-00-deadline-1000.webm -5903bd89be457be681a6c6c8fd8c19f4570173db vp90-00-deadline-1000.webm.md5 -ee5e19a8fe14d3e72b1314a012b49a3bc0586375 vp90-00-deadline-200.webm -77095f98406fa27a2da8661f21664c00292dcefc vp90-00-deadline-200.webm.md5 -8230b07aa0ee7adf3caabae4e3bef997929001eb vp90-00-deadline-50.webm -fc47a159b2d2b0bed93d4e2c35408243e70b6d24 vp90-00-deadline-50.webm.md5 -244d12cda51235dcc421fedbe12422b326f539e7 vp90-00-flower-100.webm -dfeca236450b5ff19c1558ad33fba7ab7ff75f27 vp90-00-flower-100.webm.md5 -d5b7057564f670f7bf82017e2abc3aed5656b810 vp90-00-flower-2000.webm -65118811f4d46ef1e911d520296731536d3a507e vp90-00-flower-2000.webm.md5 -a9c226643365f0c8ae03e780d55aa6c6fa9cc0e7 vp90-00-flower-300.webm -fa5193d1a6e6b9e8bb91f75e91a3a377f00fa42e vp90-00-flower-300.webm.md5 -b206284b51dec6219c46e9b03def38a94d91bf89 vp90-00-flower-4400.webm -c8a73acd8234b287e86465d03fbf4f886d1fefb2 vp90-00-flower-4400.webm.md5 -faff83d7b6aa89f5d9518ffc5d4b145eb02b6800 vp90-00-flower-800.webm -328dd1969804afc094d010f54f350bd05390d6a9 vp90-00-flower-800.webm.md5 -42caa40d3b76b8ae5e7573b95e09bc4e57bea835 vp90-00-football-1600.webm -167b8f58a85d83050d4c56391d6b2d9a9a205b9a vp90-00-football-1600.webm.md5 -4c4f93f594a8ef89a9ba903bbcff914022a5ad9d vp90-00-football-2800.webm -7995f7f91b13d4ab5badcd3f9282bd1fceba38f3 vp90-00-football-2800.webm.md5 -c3ff724e79b4ae0202929f3ed1a1a5b67d10901f vp90-00-football-400.webm -19164a0e58ca5d407282a867866e8ec4a0a08fea vp90-00-football-400.webm.md5 -95de1c4abceab3706f0225e3b9c5dc719901a6cf vp90-00-football-600.webm -4a4454ae4d65748a45eaa3decb783bbe0ba190dc vp90-00-football-600.webm.md5 -80eebcdae76459c00d14b6c50f7529377e53a1c2 vp90-00-foreman-1200.webm -8228cc5a7cc83970b3a65f9b49bc74733255b09c vp90-00-foreman-1200.webm.md5 
-601d0ff4f058a3da3af4409e4117795f7c231fda vp90-00-foreman-2000.webm -e0c0b0aa6f9597984a2d78e799a00e0052710b2c vp90-00-foreman-2000.webm.md5 -30ebc327645d68bcc83eab72610bba22f877fb4c vp90-00-foreman-300.webm -080fc2adf29a84f02a3e4b5508fc2f8dc32f1440 vp90-00-foreman-300.webm.md5 -6b1a6be0f7bd7605b565750b3080be397d4c48a0 vp90-00-foreman-600.webm -f7713d3eba8d34d511ba1c9585a5a3f34e133ba5 vp90-00-foreman-600.webm.md5 -b080d9786abc89b4be59bffc5baba7b42fbc286a vp90-00-hallmonitor-1200.webm -77be47800b58001eb7a854d4d4a9b9823bbbe158 vp90-00-hallmonitor-1200.webm.md5 -05cd8e8d58ab8311ad528c27b4c89cdf268e749b vp90-00-hallmonitor-2000.webm -de1aa35c7172e78e07d6b197280214bbd362cc4e vp90-00-hallmonitor-2000.webm.md5 -908676b32b190e956518bb742d1415efceeb8c75 vp90-00-hallmonitor-300.webm -f9d39866db341d18256339e9fd2c0ec296f47702 vp90-00-hallmonitor-300.webm.md5 -1307c7f7558de34a6230912e684ff9571a05db5f vp90-00-hallmonitor-600.webm -954b292dd56be5c1bf153df440b132e1b1fbcb68 vp90-00-hallmonitor-600.webm.md5 -05f556288c5c4211420f7c332daded816f9b31b7 vp90-00-harbour-1200.webm -399481f93cc252f20ad5141dd402cf5363673578 vp90-00-harbour-1200.webm.md5 -fa62e449485c544c281030c5ccff32c60d4dd169 vp90-00-harbour-200.webm -3d0e1885befb2493c477384917797164d4fe58e4 vp90-00-harbour-200.webm.md5 -fa3a5e563c3d2215703c1a68f71fbe2168a42468 vp90-00-harbour-3600.webm -9af392f6b2cb5ec5c9446b7262206773df535319 vp90-00-harbour-3600.webm.md5 -476db4b15989a5a078f1d2fc5f9734d1d24f1da1 vp90-00-harbour-5200.webm -352a05b179dc1f86cf6ce27494a4a8fb42379d72 vp90-00-harbour-5200.webm.md5 -0ea17a4892383a2fd0be9f88f213f5f48f2a61f4 vp90-00-highway-100.webm -a2fe942955bafa83295d1381c9a25264764924c5 vp90-00-highway-100.webm.md5 -7ab80485670a5343a74c4a2454761ed3bed7ceef vp90-00-highway-1600.webm -fda9c82cb5d28a5ff5f7dae7c537e9187dfbd4cc vp90-00-highway-1600.webm.md5 -162d42e033dad04fd7ae3bf9d39e9e204c022edc vp90-00-highway-2800.webm -b882c93a2dc89feb6090b0f72e67ac8a59fc0986 vp90-00-highway-2800.webm.md5 -79b9a0e6fa6cdd2367228e9ac8d6a369a8d647e6 vp90-00-highway-50.webm -80ecf926372dbe8c1b4bcd68ea2101f78a93b02e vp90-00-highway-50.webm.md5 -a67fd02cbb75c1a757b5ea56b9eee46069bfadbf vp90-00-husky-100.webm -12cd583e791c8e5b40b5dffe4a9dbcc1929dc645 vp90-00-husky-100.webm.md5 -1a8b4302eb6f88b14a9acd4a6cbe62d0b380f2e4 vp90-00-husky-2000.webm -a9c2532e5d867d7627bb6767008b43b653cce904 vp90-00-husky-2000.webm.md5 -f56f66afd4d4512a49904275a1c942ba7379fec4 vp90-00-husky-300.webm -196dc386f104b7b9ed2ec6c6a1f104ce0319c2eb vp90-00-husky-300.webm.md5 -6ba3c16fd98d37a8de7023419682a3595778b9bc vp90-00-husky-4400.webm -2f4815ba97e352fcd0089d1a5883a0aff1e5394a vp90-00-husky-4400.webm.md5 -db04a296c377693dd6e974bea36256f4b14cddef vp90-00-husky-800.webm -7658473ad17ee689a37fda558c5a23816131cfc3 vp90-00-husky-800.webm.md5 -50cf9e34b61e1cf32c9dde2ebcc5f5703c379a41 vp90-00-ice-150.webm -806ceba91dc40c45eafc4d7ee61df9346c6fe5f9 vp90-00-ice-150.webm.md5 -4cfca1bea7aae6e4405abfca603cfbded13ded1a vp90-00-ice-400.webm -e4298abf05419973da89c0bfcdf0006b1606ebcd vp90-00-ice-400.webm.md5 -12e3ccfdf96c3f4eebeed8106c5daef6c2b28d83 vp90-00-ice-800.webm -6fb2aacb4d8131dcabaa61a9cd2497cd09854377 vp90-00-ice-800.webm.md5 -124977938c47ba739e918533bc5d6d73e41ce2ec vp90-00-mobile-1600.webm -603b2b523c8ed5922121d285567a845bb6693d35 vp90-00-mobile-1600.webm.md5 -93f204b90250791b884479be5da534a5bc6304ff vp90-00-mobile-2800.webm -21ec8735b774c66e192f7270c12075f598f700d5 vp90-00-mobile-2800.webm.md5 -fe9cdbfdeee2b7554efb532f646703cff55c2d2c vp90-00-mobile-400.webm 
-4def63c78ee09e90e6385d3122ada95343246102 vp90-00-mobile-400.webm.md5 -2a042aa8a06c45770dcb52c56a7f5cea6d51b8dd vp90-00-mobile-600.webm -03169f031dece0db3d89ce16cc3e0ee3eca21065 vp90-00-mobile-600.webm.md5 -7fc5b0b0c684d63e161c9c5932e1374327e15dd4 vp90-00-motherdaughter-100.webm -290ac7722caf4b15136b307a239c9b903113b9c4 vp90-00-motherdaughter-100.webm.md5 -67ddfce82bff083a1ceb108a7dcfb801791102f1 vp90-00-motherdaughter-300.webm -7696698d38e32f0afeb3a3e9a45b7fe3f237aaba vp90-00-motherdaughter-300.webm.md5 -ff65a1bee2fe384728017c5148df61379043d5b6 vp90-00-motherdaughter-600.webm -f0b167000bf40877d1ba7ba52a08b4310011c032 vp90-00-motherdaughter-600.webm.md5 -d73c54e676bd63424fc9ad8d0cef64e929081cf4 vp90-00-news-100.webm -71821b71a97823e9ba58563efc841dc6beefe9df vp90-00-news-100.webm.md5 -2937238d094863951eb8f218438b966d2b7b5430 vp90-00-news-300.webm -2587d0859a330cf6d8e0a135d1f586bb2a5033fc vp90-00-news-300.webm.md5 -65afdd4fc411951115b48435b8b65155594b5c99 vp90-00-news-600.webm -5815bb341db976f44dab97bb9cfba8ea0ca55502 vp90-00-news-600.webm.md5 -de5dd99ac04d3a937fc0951d06fb8f533fdc393a vp90-00-pamphlet-150.webm -0381d705fa490f35c772e3048b423b382088d546 vp90-00-pamphlet-150.webm.md5 -46f283284cb64b79243b2ea6aad709a526c26393 vp90-00-pamphlet-25.webm -f100fbebcad96f27ed8f340414b939bc738d49d0 vp90-00-pamphlet-25.webm.md5 -8df04ece12455c5c40f14cb089348260798c5f2b vp90-00-pamphlet-400.webm -66a2c87cd4194368d3477e9a334880b76c87e991 vp90-00-pamphlet-400.webm.md5 -a00e97e4a71f5e24f194c59cde7d41bc2c3af325 vp90-00-paris-1000.webm -53ef896e16d1b83aa5166945d149c7133401b3f0 vp90-00-paris-1000.webm.md5 -6b03388e0236f6171e20c73834858e3c87b441b2 vp90-00-paris-200.webm -55a324b0153c5d54cd0c0492fed8755c441fa18c vp90-00-paris-200.webm.md5 -429ec362a9600c8822652cf7e122e22bca033d69 vp90-00-paris-50.webm -4406226b7bddb11ede8ee0c442d52e5d3bbbde78 vp90-00-paris-50.webm.md5 -a7996d4e757ea484aa72e14f623d6c9e72537888 vp90-00-signirene-1000.webm -f65a1ac6e1ce77102e63fb363dbca361b8108c02 vp90-00-signirene-1000.webm.md5 -8c2f686179bc3e87a18b48bcb5058f3cd61e1b4c vp90-00-signirene-200.webm -b8ab16cba9392e49169c374eb1e0c1b763ccaefb vp90-00-signirene-200.webm.md5 -5f8f99c386dce64931bbd4fc42a59a78dc6fdba1 vp90-00-signirene-50.webm -fdb8c4bc302884d413a256634d3e2fbd92867c90 vp90-00-signirene-50.webm.md5 -d5074f0a5bcefe9fd651afbbebf0e0f3fedb965b vp90-00-silent-1000.webm -9c075894fbfb84791fcc7dbd3fcab15b0a9bd64e vp90-00-silent-1000.webm.md5 -32101f334f675715a8f411638dfda80afacc37a6 vp90-00-silent-200.webm -fb0dac37f31ca711443832046a6aaf868e69b357 vp90-00-silent-200.webm.md5 -0aaef50d7f94873e99ec7e39f59a6b74e92ad946 vp90-00-silent-50.webm -be9fc41965b5b63f7c7bbd6c91191e940903e012 vp90-00-silent-50.webm.md5 -5e22ad14c562733d4d4a3ce163b580ed4a64e6fe vp90-00-soccer-100.webm -1ca9a0016910cfca26def9944568749a168131d8 vp90-00-soccer-100.webm.md5 -2d9b2a0fa5ac210f8d7c646578698e045733ad4a vp90-00-soccer-2000.webm -f979078650057606ca770b3f03be4c509efb40a9 vp90-00-soccer-2000.webm.md5 -7b789360ffc1eb5a3735f8a1f8d248a24ca4267c vp90-00-soccer-300.webm -195d33b23ca8304519bd6e38e9657e53a04779d8 vp90-00-soccer-300.webm.md5 -3907318ef35573e4efc5c150d3aff271c7157501 vp90-00-soccer-4400.webm -4b43ceecae9a9a7d39a47347f9e20af3613827d1 vp90-00-soccer-4400.webm.md5 -c89920aa89194cb6a36f77dff8722573f0df7241 vp90-00-soccer-800.webm -1da71751009afa483a03e274a538df24c9f5e513 vp90-00-soccer-800.webm.md5 -efca14e8e0515a8f8ed3ded11fdbff24b09a7f9d vp90-00-stefan-1600.webm -6f103270ce03cc85b28dd1c86d0447922d810671 vp90-00-stefan-1600.webm.md5 
-b99ab6a983d48c15aa3a9160d06286fca0074193 vp90-00-stefan-2800.webm -986a72dd9988c6bf4246cd5bd966ce991ba55319 vp90-00-stefan-2800.webm.md5 -eb962244ca51a101ad8f585df6be8f5f96691f18 vp90-00-stefan-400.webm -2747cfd8f74aedc370767f08129b35ace70e1fe7 vp90-00-stefan-400.webm.md5 -b507b8cedd0147c5316db8f84f35ace768c25069 vp90-00-stefan-600.webm -daeb369046c2dc27ecfde978b87fd8b49d83789f vp90-00-stefan-600.webm.md5 -c5c2dd891c2b5fe4a70845858ccb859df3455ee7 vp90-00-students-100.webm -d1be06dc636ece0c34ab8c17399888aaf19e0c19 vp90-00-students-100.webm.md5 -c9e4da3a8b455aa690d89338f32f9d76773cdd18 vp90-00-students-300.webm -a9aa72e1ee27063f8e9f13b4647cec01c8efb2d6 vp90-00-students-300.webm.md5 -e9e5072cd944a8994e50fce367975e3ce526bd67 vp90-00-students-600.webm -86525ce188a98a51f86fad27341729bb61d1ca8b vp90-00-students-600.webm.md5 -58deb053aeafefdfdf13741accf9fcbe4584ea94 vp90-00-tempete-1200.webm -ec395a2ec76b4c1e64e243366a8840da22ee3a65 vp90-00-tempete-1200.webm.md5 -5d35232eaa8ee149a917ff94536968fb37dad50e vp90-00-tempete-200.webm -7f8c7529f40d6b6d6de8e89dbf9697623d27c234 vp90-00-tempete-200.webm.md5 -c44eb147bc3f8682b96096fccef8beb4380c40db vp90-00-tempete-3600.webm -01fd23e412530fa2d5319a22886161957a747ee0 vp90-00-tempete-3600.webm.md5 -56ab322b34a750e16dcc8ccfb735a5b9270cedc4 vp90-00-tempete-5200.webm -1cf803409ae53b991bff10079af4ab07aaa2853d vp90-00-tempete-5200.webm.md5 -ffe48d52019c228e919f4b123028664b8d0c2f4b vp90-00-tennis-100.webm -406fda3367899995d4e37170063495832e2be372 vp90-00-tennis-100.webm.md5 -6c030f8142b1932fbe8eb5c2b39b3452a5eea3aa vp90-00-tennis-2000.webm -dcf20921e2a8ab0dcd09f7f6bdcdd35f979205ae vp90-00-tennis-2000.webm.md5 -3fe0df7b74f301b39e1b21e6926c69a8418b9b70 vp90-00-tennis-300.webm -80c8301d3a37b33ca50318ba000066a6ae9929dc vp90-00-tennis-300.webm.md5 -82a2497083b8dce6b1c73bcdf16323ea69d1cca9 vp90-00-tennis-4400.webm -83ce97bc09a7e1b2f2c3437195a8931d7608a62b vp90-00-tennis-4400.webm.md5 -2c8bd3a29bbd1085169bfcba9fdf65a37f4a16bb vp90-00-tennis-800.webm -9920a65e06d2e7025f13f3d8bf35670503875aed vp90-00-tennis-800.webm.md5 -26469062c5724c2cc4914436ef032bb55373f843 vp90-00-waterfall-150.webm -9b86373ce15302a9b22cef8f808ce0e37e6d2b65 vp90-00-waterfall-150.webm.md5 -410ba6af2ddca5110fa7a4c383dc8b28f38cf565 vp90-00-waterfall-200.webm -251892d3fdcbc9d7a20c22ba202ed4935222e5b8 vp90-00-waterfall-200.webm.md5 -40b643aff88aed3764c5b58c446a8fbbc5fb36d7 vp90-00-waterfall-400.webm -51f31a6b6408f8af4d107e0f2a3c1a274d4da6bb vp90-00-waterfall-400.webm.md5 -bd421141e01f53dc15ced790f9a96ab70a613260 vp90-00-waterfall-800.webm -1366efe772fccaa2b8a6ac3ce45255b312a2ef6c vp90-00-waterfall-800.webm.md5 +ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 vp90-2-00-quantizer-00.webm +ac5eda33407d0521c7afca43a63fd305c0cd9d13 vp90-2-00-quantizer-00.webm.md5 +2ca0463f2cfb93d25d7dded174db70b7cb87cb48 vp90-2-00-quantizer-01.webm +10d98884fc6d9a5f47a2057922b8e25dd48d7786 vp90-2-00-quantizer-01.webm.md5 +d80a2920a5e0819d69dcba8fe260c01f820f8982 vp90-2-00-quantizer-02.webm +c964c8e5e04165fabbf1c6ee8ee5121d35921965 vp90-2-00-quantizer-02.webm.md5 +fdef046777b5b75c962b715d809dbe2ea331afb9 vp90-2-00-quantizer-03.webm +f270bee0b0c7aa2bf4c5afe098556b4f3f890faf vp90-2-00-quantizer-03.webm.md5 +66d98609e809394a6ac730787e6724e3badc075a vp90-2-00-quantizer-04.webm +427433bfe121c4aea1095ec3124fdc174d200e3a vp90-2-00-quantizer-04.webm.md5 +e6e42626d8cadf0b5be16313f69212981b96fee5 vp90-2-00-quantizer-05.webm +c98f6a9a1af4cfd71416792827304266aad4bd46 vp90-2-00-quantizer-05.webm.md5 +413ef09b721f5dcec1a96e937a97e5873c2e6db6 
vp90-2-00-quantizer-06.webm +5080e940a23805c82e578e21b57fc2c511e76376 vp90-2-00-quantizer-06.webm.md5 +4a50a5f4ac717c30dfaae8bb46702e3542e867de vp90-2-00-quantizer-07.webm +76c429a02b56762e10ee4db88729d8834b3a70f4 vp90-2-00-quantizer-07.webm.md5 +d2f4e464780bf8b7e647efa18ac777a930e62bc0 vp90-2-00-quantizer-08.webm +ab94aabf9316111b52d7c531962ed4123313b6ba vp90-2-00-quantizer-08.webm.md5 +174bc58433936dd79550398d744f1072ce7f5693 vp90-2-00-quantizer-09.webm +e1f7690cd83ccc56d045e17cce552544a5f03810 vp90-2-00-quantizer-09.webm.md5 +52bc1dfd3a97b24d922eb8a31d07527891561f2a vp90-2-00-quantizer-10.webm +9b37bed893b5f6a4e12f2aa40f02dd40f944d0f8 vp90-2-00-quantizer-10.webm.md5 +10031eecafde1e1d8e6323fe2b2a1d7e77a66869 vp90-2-00-quantizer-11.webm +fe4620a4bb0e4f5cb9bbfedc4039a22b81b0f5c0 vp90-2-00-quantizer-11.webm.md5 +78e9f7bb77e8e348155bbdfa12790789d1d50c34 vp90-2-00-quantizer-12.webm +0961d060cc8dd469c6dac8d7d75f927c0bb971b8 vp90-2-00-quantizer-12.webm.md5 +133b77a3bbcef652552d74ffc46afbfe3b8a1cba vp90-2-00-quantizer-13.webm +df29e5e0f95772af482f540d776f6b9dea4bfa29 vp90-2-00-quantizer-13.webm.md5 +27323afdaf8987e025c27129c74c86502315a206 vp90-2-00-quantizer-14.webm +ce96a2cc312942f0427a463f15a392870dd69764 vp90-2-00-quantizer-14.webm.md5 +ab58d0b41037829f6bc993910999f4af0212aafd vp90-2-00-quantizer-15.webm +40f700db606501aa7cb49049624cbdde6409b122 vp90-2-00-quantizer-15.webm.md5 +cd948e66448aafb65998815ce37241f95d7c9ee7 vp90-2-00-quantizer-16.webm +039b742d149c945ed79c7b9a6384352852a1c116 vp90-2-00-quantizer-16.webm.md5 +62f56e663e13c576764e491cf08f19bd46a71999 vp90-2-00-quantizer-17.webm +90c5a39bf76e6b3e0a1c0d3e9b68a9fd78be963e vp90-2-00-quantizer-17.webm.md5 +f26ecad7263cd66a614e53ba5d7c00df181affeb vp90-2-00-quantizer-18.webm +cda0a1c0fca2ec2976ae55124a8a67305508bae6 vp90-2-00-quantizer-18.webm.md5 +94bfc4c04fcfe139a63b98c569e8c14ba98c401f vp90-2-00-quantizer-19.webm +5b8ec169ccf67d8a0a8e46a62eb173f5a1dbaf4f vp90-2-00-quantizer-19.webm.md5 +0ee88e9318985e1e245de78c2c4a665885ab76a7 vp90-2-00-quantizer-20.webm +4b26f7edb4fcd3a1b4cce9ba3cb8650e3ee6e063 vp90-2-00-quantizer-20.webm.md5 +6a995cb2b1db33da8087321df1e646f95c3e32d1 vp90-2-00-quantizer-21.webm +e216b4a1eceac03efcc433759be54ab8ea87b24b vp90-2-00-quantizer-21.webm.md5 +aa7722fc427e7180115f3c9cd96bb6b2768e7296 vp90-2-00-quantizer-22.webm +1aa813bd45ae831bf5e79ace4d73dfd25989a07d vp90-2-00-quantizer-22.webm.md5 +7677e5b929ed6d142041f19b8a9cd5822ee1504a vp90-2-00-quantizer-23.webm +0de0af34abd843d5b37e58baf3ed96a6104b64c3 vp90-2-00-quantizer-23.webm.md5 +b2995cbe1128b2d4926f1b28d01c501ecb6be8c8 vp90-2-00-quantizer-24.webm +db6033af2ba2f2bca62468fb4b8808e474f93923 vp90-2-00-quantizer-24.webm.md5 +8135ba35587fd92cd4667be7896323d9b634401c vp90-2-00-quantizer-25.webm +3499e00c2cc15876f61f07e3d3cfca54ebcd98fd vp90-2-00-quantizer-25.webm.md5 +af0fa2907746db82d345f6d831fcc1b2862a29fb vp90-2-00-quantizer-26.webm +cd6fe3d14dab48886ebf65be00e6ed9616ebe5a7 vp90-2-00-quantizer-26.webm.md5 +bd0002e91323776beb5ff11e06edcf19fc08e9b9 vp90-2-00-quantizer-27.webm +fe72154ef196067d6c272521012dd79706496cac vp90-2-00-quantizer-27.webm.md5 +fc15eb606f81455ff03df16bf3432296b002c43c vp90-2-00-quantizer-28.webm +40b2e24b542206a6bfd746ef199e49ccea07678a vp90-2-00-quantizer-28.webm.md5 +3090bbf913cad0b2eddca7228f5ed51a58378b8d vp90-2-00-quantizer-29.webm +eb59745e0912d8ed6c928268bcf265237c9ba93f vp90-2-00-quantizer-29.webm.md5 +c615abdca9c25e1cb110d908edbedfb3b7c92b91 vp90-2-00-quantizer-30.webm +ad0f4fe6733e4e7cdfe8ef8722bb341dcc7538c0 
vp90-2-00-quantizer-30.webm.md5 +037d9f242086cfb085518f6416259defa82d5fc2 vp90-2-00-quantizer-31.webm +4654b40792572f0a790874c6347ef9196d86c1a7 vp90-2-00-quantizer-31.webm.md5 +505899f3f3515044c5c8b3213d9b9d16f614619d vp90-2-00-quantizer-32.webm +659a2e6dd02df323f62600626859006640b445df vp90-2-00-quantizer-32.webm.md5 +8b32ec9c3b7e5ca8ddc6b8aea1c1cb7ca996bccc vp90-2-00-quantizer-33.webm +5b175ef1120ddeba4feae1247bf381bbc4e816ce vp90-2-00-quantizer-33.webm.md5 +4d283755d17e287b1d099a80604398f60d7fb6ea vp90-2-00-quantizer-34.webm +22a739de95acfeb27524e3700b8f678a9ad744d8 vp90-2-00-quantizer-34.webm.md5 +4296f56a892a412d3d4f64824718dd566c4e6459 vp90-2-00-quantizer-35.webm +c532c9c8dc7b3506fc6a51e5c20c17ef0ac039e7 vp90-2-00-quantizer-35.webm.md5 +6f54e11da461e4410dd9075b015e2d9bc1d07dfb vp90-2-00-quantizer-36.webm +0b3573f5addea4e3eb11a0b85f068299d5bdad78 vp90-2-00-quantizer-36.webm.md5 +210581682a26c2c4375efc785c36e07539888bc2 vp90-2-00-quantizer-37.webm +2b4fb6f8ba975237858e61cc8f560bcfc87cb38e vp90-2-00-quantizer-37.webm.md5 +a15ef31283dfc4860f837fe200eb32a445f59629 vp90-2-00-quantizer-38.webm +fb76771f3a795054b9936f70da7505c3ac585284 vp90-2-00-quantizer-38.webm.md5 +1df8433a441412831daae6726df89fa70d21b14d vp90-2-00-quantizer-39.webm +39e162c09a20e7e684868097766347014371fee6 vp90-2-00-quantizer-39.webm.md5 +5330e4788ab9129dbb25a7a7d5411104521248b6 vp90-2-00-quantizer-40.webm +872cc0f2cc9dbf000f89eadb4d8f9940e48e00b1 vp90-2-00-quantizer-40.webm.md5 +d88d03b982889e399a78d7a06eeb1cf30e6c2da2 vp90-2-00-quantizer-41.webm +5b4f7217e57fa2a221011d0b32f8d0409496b7b6 vp90-2-00-quantizer-41.webm.md5 +9e16406e3e26955a6e17d455ef1ef64bbfa26e53 vp90-2-00-quantizer-42.webm +0219d090cf37daabe19256ba8e932ba4874b92e4 vp90-2-00-quantizer-42.webm.md5 +a9b15843486fb05f8cd15437ef279782a42b75db vp90-2-00-quantizer-43.webm +3c9b0b4c607f9579a31726bfcf56729334ddc686 vp90-2-00-quantizer-43.webm.md5 +1dbc931ac446c91eabe7213efff55b596cccf07c vp90-2-00-quantizer-44.webm +73bc8f675103abaef3d9f73a2742b3bffd726d23 vp90-2-00-quantizer-44.webm.md5 +7c6c1be15beb9d6201204b018966c8c4f9777efc vp90-2-00-quantizer-45.webm +c907b29da821f790c6748de61f592689312e4e36 vp90-2-00-quantizer-45.webm.md5 +07b434da1a467580f73b32177ee11b3e00f65a0d vp90-2-00-quantizer-46.webm +7b2b7ce60c50bc970bc0ada46d7a7ce440148da3 vp90-2-00-quantizer-46.webm.md5 +233d0465fb1a6fa36e9f89bd2193ac79bd4d2809 vp90-2-00-quantizer-47.webm +527e0a9fb932efe915027ffe077f9e8d3a4fb139 vp90-2-00-quantizer-47.webm.md5 +719613df7307e205c3fdb6acfb373849c5ab23c7 vp90-2-00-quantizer-48.webm +65ab6c9d1b682c183b201c7ff42b90343ce3e304 vp90-2-00-quantizer-48.webm.md5 +3bf04a598325ed0eabae1598ec7f718f715ec672 vp90-2-00-quantizer-49.webm +ac68c4387ce11fcc998d8ba455ab9b2bb361d240 vp90-2-00-quantizer-49.webm.md5 +d59238fb3a654931c9b65a11e7321b40d1f702e9 vp90-2-00-quantizer-50.webm +d0576bfede46fd55659f028f2fd28554ceb3e6cc vp90-2-00-quantizer-50.webm.md5 +3f579785101d4209360dd96f8c2ffe9beddf3bee vp90-2-00-quantizer-51.webm +89fcfe04f4457a7f02ab4a2f94aacbb88aee5789 vp90-2-00-quantizer-51.webm.md5 +28be5836e2fedefe4babf12fc9b79e460ab0a0f4 vp90-2-00-quantizer-52.webm +f3dd52b70c18345fee740220f35da9c4def2017a vp90-2-00-quantizer-52.webm.md5 +488ad4058c17170665b6acd1021fade9a02771e4 vp90-2-00-quantizer-53.webm +1cdcb1d4f3a37cf83ad235eb27ec62ed2a01afc7 vp90-2-00-quantizer-53.webm.md5 +682978289cb28cc8c9d39bc797300e45d6039de7 vp90-2-00-quantizer-54.webm +36c35353f2c03cb099bd710d9994de7d9ed88834 vp90-2-00-quantizer-54.webm.md5 +c398ce49af762a48f10cc4da9fae0769aae5f226 
vp90-2-00-quantizer-55.webm +2cf3570542d984f167ab087f59493c7fb47e0ed2 vp90-2-00-quantizer-55.webm.md5 +3071f18b2fce261aa82d61f81a7ae4ca9a75d0e3 vp90-2-00-quantizer-56.webm +d3f93f8272b6de31cffb011a26f11abb514efb12 vp90-2-00-quantizer-56.webm.md5 +f4e8e14b1f278801a7eb6f11734780a01b1668e9 vp90-2-00-quantizer-57.webm +6478fdf1d7faf6db5f19dffc5e1363af358699ee vp90-2-00-quantizer-57.webm.md5 +307dc264f57cc618fff211fa44d7f52767ed9660 vp90-2-00-quantizer-58.webm +cf231d4a52d492fa692ea4194ec5eb7511fec54e vp90-2-00-quantizer-58.webm.md5 +1fd7cd596170afce2de0b1441b7674bda5723440 vp90-2-00-quantizer-59.webm +4681f7ef96f63e085c41bb1a964b0df7e67e0b38 vp90-2-00-quantizer-59.webm.md5 +34cdcc81c0ba7085aefbb22d7b4aa9bca3dd7c62 vp90-2-00-quantizer-60.webm +58691ef53b6b623810e2c57ded374c77535df935 vp90-2-00-quantizer-60.webm.md5 +e6e812406aab81021bb16e772c1db03f75906cb6 vp90-2-00-quantizer-61.webm +76436eace62f08ff92b61a0845e66667a027db1b vp90-2-00-quantizer-61.webm.md5 +84d811bceed70c950a6a08e572a6e274866e72b1 vp90-2-00-quantizer-62.webm +2d937cc011eeddd95222b960982da5cd18db580f vp90-2-00-quantizer-62.webm.md5 +0912b295ba0ea09359315315ffd67d22d046f883 vp90-2-00-quantizer-63.webm +5a829031055d70565f57dbcd47a6ac33619952b3 vp90-2-00-quantizer-63.webm.md5 +0cf9e5ebe0112bdb47b5887ee5d58eb9d4727c00 vp90-2-01-sharpness-1.webm +5a0476be4448bae8f8ca17ea236c98793a755948 vp90-2-01-sharpness-1.webm.md5 +51e02d7911810cdf5be8b68ac40aedab479a3179 vp90-2-01-sharpness-2.webm +a0ca5bc87a5ed7c7051f59078daa0d03be1b45b6 vp90-2-01-sharpness-2.webm.md5 +0603f8ad239c07a531d948187f4dafcaf51eda8d vp90-2-01-sharpness-3.webm +3af8000a69c72fe77881e3176f026c2affb78cc7 vp90-2-01-sharpness-3.webm.md5 +4ca4839f48146252fb261ed88838d80211804841 vp90-2-01-sharpness-4.webm +08832a1494f84fa9edd40e080bcf2c0e80100c76 vp90-2-01-sharpness-4.webm.md5 +95099dc8f9cbaf9b9a7dd65311923e441ff70731 vp90-2-01-sharpness-5.webm +93ceee30c140f0b406726c0d896b9db6031c4c7f vp90-2-01-sharpness-5.webm.md5 +ceb4116fb7b078d266d153233b6d62a255a34e4c vp90-2-01-sharpness-6.webm +da83efe59e537ce538e8b03a6eac63cf25849c9a vp90-2-01-sharpness-6.webm.md5 +b5f7cd19aece3880f9d616a778e5cc24c6b9b505 vp90-2-01-sharpness-7.webm +2957408d20deac8633941a2169f801bae6f086e1 vp90-2-01-sharpness-7.webm.md5 +ffc096c2ce1050450ad462b5fabd2a5220846319 vp90-2-02-size-08x08.webm +e36d2ed6fa2746347710b750586aafa6a01ff3ae vp90-2-02-size-08x08.webm.md5 +895b986f9fd55cd879472b31c6a06b82094418c8 vp90-2-02-size-08x10.webm +079157a19137ccaebba606f2871f45a397347150 vp90-2-02-size-08x10.webm.md5 +1c5992203e62a2b83040ccbecd748b604e19f4c0 vp90-2-02-size-08x16.webm +9aa45ffdf2078f883bbed01450031b691819c144 vp90-2-02-size-08x16.webm.md5 +d0a8953da1f85f484487408fee5da9e2a8391901 vp90-2-02-size-08x18.webm +59a5cc17d354c6a23e5e959d666b1456a5d49c56 vp90-2-02-size-08x18.webm.md5 +1b13461a9fc65cb041bacfe4ea6f02d363397d61 vp90-2-02-size-08x32.webm +2bdddd6878f05d37d84cde056a3f5e7f926ba3d6 vp90-2-02-size-08x32.webm.md5 +2861f0a0daadb62295b0504a1fbe5b50c79a8f59 vp90-2-02-size-08x34.webm +6b5812cfb8a82d378ea2913bf009e93668020147 vp90-2-02-size-08x34.webm.md5 +02f948216d4246579dc53c47fe55d8fb264ba251 vp90-2-02-size-08x64.webm +84b55fdee6d9aa820c7a8c62822446184b191767 vp90-2-02-size-08x64.webm.md5 +4b011242cbf42516efd2b197baebb61dd34562c9 vp90-2-02-size-08x66.webm +6b1fa0a885947b3cc0fe58f75f838e662bd9bb8b vp90-2-02-size-08x66.webm.md5 +4057796be9dd12df48ab607f502ae6aa70eeeab6 vp90-2-02-size-10x08.webm +71c752c51aec9f48de286b93f4c20e9c11cad7d0 vp90-2-02-size-10x08.webm.md5 +6583c853fa43fc53d51743eac5f3a43a359d45d0 
vp90-2-02-size-10x10.webm +1da524d24af1944b671d4d3f2b398d6e336584c3 vp90-2-02-size-10x10.webm.md5 +ba442fc03ccd3a705c64c83b36f5ada67d198874 vp90-2-02-size-10x16.webm +7cfd960f232c34c641a4a2a9411b6fd0efb2fc50 vp90-2-02-size-10x16.webm.md5 +cc92ed40eef14f52e4d080cb2c57939dd8326374 vp90-2-02-size-10x18.webm +db5626275cc55ce970b91c995e74f6838d943aca vp90-2-02-size-10x18.webm.md5 +3a93d501d22325e9fd4c9d8b82e2a432de33c351 vp90-2-02-size-10x32.webm +5cae51b0c71cfc131651f345f87583eb2903afaf vp90-2-02-size-10x32.webm.md5 +50d2f2b15a9a5178153db44a9e03aaf32b227f67 vp90-2-02-size-10x34.webm +bb0efe058122641e7f73e94497dda2b9e6c21efd vp90-2-02-size-10x34.webm.md5 +01624ec173e533e0b33fd9bdb91eb7360c7c9175 vp90-2-02-size-10x64.webm +b9c0e3b054463546356acf5157f9be92fd34732f vp90-2-02-size-10x64.webm.md5 +2942879baf1c09e96b14d0fc84806abfe129c706 vp90-2-02-size-10x66.webm +bab5f539c2f91952e187456b4beafbb4c01e25ee vp90-2-02-size-10x66.webm.md5 +88d2b63ca5e9ee163d8f20e8886f3df3ff301a66 vp90-2-02-size-16x08.webm +7f48a0fcf8c25963f3057d7f6669c5f2415834b8 vp90-2-02-size-16x08.webm.md5 +59261eb34c15ea9b5ddd2d416215c1a8b9e6dc1f vp90-2-02-size-16x10.webm +73a7c209a46dd051c9f7339b6e02ccd5b3b9fc81 vp90-2-02-size-16x10.webm.md5 +066834fef9cf5b9a72932cf4dea5f253e14a976d vp90-2-02-size-16x16.webm +faec542f52f37601cb9c480d887ae9355be99372 vp90-2-02-size-16x16.webm.md5 +195307b4eb3192271ee4a935b0e48deef0c54cc2 vp90-2-02-size-16x18.webm +5a92e19e624c0376321d4d0e22c0c91995bc23e1 vp90-2-02-size-16x18.webm.md5 +14f3f884216d7ae16ec521f024a2f2d31bbf9c1a vp90-2-02-size-16x32.webm +ea622d1c817dd174556f7ee7ccfe4942b34d4845 vp90-2-02-size-16x32.webm.md5 +2e0501100578a5da9dd47e4beea160f945bdd1ba vp90-2-02-size-16x34.webm +1b8645ef64239334921c5f56b24ce815e6070b05 vp90-2-02-size-16x34.webm.md5 +89a6797fbebebe93215f367229a9152277f5dcfe vp90-2-02-size-16x64.webm +a03d8c1179ca626a8856fb416d635dbf377979cd vp90-2-02-size-16x64.webm.md5 +0f3a182e0750fcbae0b9eae80c7a53aabafdd18d vp90-2-02-size-16x66.webm +8cb6736dc2d897c1283919a32068af377d66c59c vp90-2-02-size-16x66.webm.md5 +68fe70dc7914cc1d8d6dcd97388b79196ba3e7f1 vp90-2-02-size-18x08.webm +874c7fb505be9db3160c57cb405c4dbd5b990dc2 vp90-2-02-size-18x08.webm.md5 +0546352dd78496d4dd86c3727ac2ff36c9e72032 vp90-2-02-size-18x10.webm +1d80eb36557ea5f25a386495a36f93da0f25316b vp90-2-02-size-18x10.webm.md5 +60fe99e5f5cc99706efa3e0b894e45cbcf0d6330 vp90-2-02-size-18x16.webm +1ab6cdd89a53662995d103546e6611c84f9292ab vp90-2-02-size-18x16.webm.md5 +f9a8f5fb749d69fd555db6ca093b7f77800c7b4f vp90-2-02-size-18x18.webm +ace8a66328f7802b15f9989c2720c029c6abd279 vp90-2-02-size-18x18.webm.md5 +a197123a527ec25913a9bf52dc8c347749e00045 vp90-2-02-size-18x32.webm +34fbd7036752232d1663e70d7f7cdc93f7129202 vp90-2-02-size-18x32.webm.md5 +f219655a639a774a2c9c0a9f45c28dc0b5e75e24 vp90-2-02-size-18x34.webm +2c4d622a9ea548791c1a07903d3702e9774388bb vp90-2-02-size-18x34.webm.md5 +5308578da48c677d477a5404e19391d1303033c9 vp90-2-02-size-18x64.webm +e7fd4462527bac38559518ba80e41847db880f15 vp90-2-02-size-18x64.webm.md5 +e109a7e013bd179f97e378542e1e81689ed06802 vp90-2-02-size-18x66.webm +45c04e422fb383c1f3be04beefaa4490e83bdb1a vp90-2-02-size-18x66.webm.md5 +38844cae5d99caf445f7de33c3ae78494ce36c01 vp90-2-02-size-32x08.webm +ad018be39e493ca2405225034b1a5b7a42af6f3a vp90-2-02-size-32x08.webm.md5 +7b57eaad55906f9de9903c8657a3fcb2aaf792ea vp90-2-02-size-32x10.webm +2294425d4e55d275af5e25a0beac9738a1b4ee73 vp90-2-02-size-32x10.webm.md5 +f47ca2ced0d47f761bb0a5fdcd911d3f450fdcc1 vp90-2-02-size-32x16.webm 
+ae10981d93913f0ab1f28c1146255e01769aa8c0 vp90-2-02-size-32x16.webm.md5 +08b23ad838b6cf1fbfe3ad7e7775d95573e815fc vp90-2-02-size-32x18.webm +1ba76f4c4a4ac7aabfa3ce195c1b473535eb7cc8 vp90-2-02-size-32x18.webm.md5 +d5b88ae6c8c25c53dee74d9f1e6ca64244349a57 vp90-2-02-size-32x32.webm +e39c067a8ee2da52a51641eb1cb7f8eba935eb6b vp90-2-02-size-32x32.webm.md5 +529429920dc36bd899059fa75a767f02c8c60874 vp90-2-02-size-32x34.webm +56888e7834f52b106e8911e3a7fc0f473b609995 vp90-2-02-size-32x34.webm.md5 +38e848e160391c2b1a55040aadde613b9f4bf15e vp90-2-02-size-32x64.webm +8950485fb3f68b0e8be234db860e4ec5f5490fd0 vp90-2-02-size-32x64.webm.md5 +5e8670f0b8ec9cefa8795b8959ffbe1a8e1aea94 vp90-2-02-size-32x66.webm +225df9d7d72ec711b0b60f4aeb65311c97db054a vp90-2-02-size-32x66.webm.md5 +695f929e2ce6fb11a1f180322d46c5cb1c97fa61 vp90-2-02-size-34x08.webm +5bb4262030018dd01883965c6aa6070185924ef6 vp90-2-02-size-34x08.webm.md5 +5adf74ec906d2ad3f7526e06bd29f5ad7d966a90 vp90-2-02-size-34x10.webm +71c100b437d3e8701632ae8d65c3555339b1c68f vp90-2-02-size-34x10.webm.md5 +d0918923c987fba2d00193d83797b21289fe54aa vp90-2-02-size-34x16.webm +5d5a52f3535b4d2698dd3d87f4a13fdc9b57163d vp90-2-02-size-34x16.webm.md5 +553ab0042cf87f5e668ec31b2e4b2a4b6ec196fd vp90-2-02-size-34x18.webm +a164c7f3c424987df2340496e6a8cf76e973f0f1 vp90-2-02-size-34x18.webm.md5 +baf3e233634f150de81c18ba5d8848068e1c3c54 vp90-2-02-size-34x32.webm +22a79d3bd1c9b85dfe8c70bb2e19f08a92a8be03 vp90-2-02-size-34x32.webm.md5 +6d50a533774a7167350e4a7ef43c94a5622179a2 vp90-2-02-size-34x34.webm +0c099638e79c273546523e06704553e42eb00b00 vp90-2-02-size-34x34.webm.md5 +698cdd0a5e895cc202c488675e682a8c537ede4f vp90-2-02-size-34x64.webm +9317b63987cddab8389510a27b86f9f3d46e3fa5 vp90-2-02-size-34x64.webm.md5 +4b5335ca06f082b6b69f584eb8e7886bdcafefd3 vp90-2-02-size-34x66.webm +e18d68b35428f46a84a947c646804a51ef1d7cec vp90-2-02-size-34x66.webm.md5 +a54ae7b494906ec928a876e8290e5574f2f9f6a2 vp90-2-02-size-64x08.webm +87f9f7087b6489d45e9e4b38ede2c5aef4a4928f vp90-2-02-size-64x08.webm.md5 +24522c70804a3c23d937df2d829ae63965b23f38 vp90-2-02-size-64x10.webm +447ce03938ab53bffcb4a841ee0bfaa90462dcb9 vp90-2-02-size-64x10.webm.md5 +2a5035d035d214ae614af8051930690ef623989b vp90-2-02-size-64x16.webm +84e355761dd2e0361b904c84c52a0dd0384d89cf vp90-2-02-size-64x16.webm.md5 +3a293ef4e270a19438e59b817fbe5f43eed4d36b vp90-2-02-size-64x18.webm +666824e5ba746779eb46079e0631853dcc86d48b vp90-2-02-size-64x18.webm.md5 +ed32fae837095c9e8fc95d223ec68101812932c2 vp90-2-02-size-64x32.webm +97086eadedce1d0d9c072b585ba7b49aec69b1e7 vp90-2-02-size-64x32.webm.md5 +696c7a7250bdfff594f4dfd88af34239092ecd00 vp90-2-02-size-64x34.webm +253a1d38d452e7826b086846c6f872f829c276bb vp90-2-02-size-64x34.webm.md5 +fc508e0e3c2e6872c60919a60b812c5232e9c2b0 vp90-2-02-size-64x64.webm +2cd6ebeca0f82e9f505616825c07950371b905ab vp90-2-02-size-64x64.webm.md5 +0f8a4fc1d6521187660425c283f08dff8c66e476 vp90-2-02-size-64x66.webm +5806be11a1d346be235f88d3683e69f73746166c vp90-2-02-size-64x66.webm.md5 +273b0c36e3658685cde250408a478116d7ae92f1 vp90-2-02-size-66x08.webm +23c3cd0dca20a2f71f036e77ea92025ff4e7a298 vp90-2-02-size-66x08.webm.md5 +4844c59c3306d1e671bb0568f00e344bf797e66e vp90-2-02-size-66x10.webm +e041eaf6841d775f8fde8bbb4949d2733fdaab7f vp90-2-02-size-66x10.webm.md5 +bdf3f1582b234fcd2805ffec59f9d716a2345302 vp90-2-02-size-66x16.webm +2ec85ee18119e6798968571ea6e1b93ca386e3af vp90-2-02-size-66x16.webm.md5 +0acce9af12b13b025d5274013da7ef6f568f075f vp90-2-02-size-66x18.webm +77c4d53e2a5c96b70af9d575fe6811e0f5ee627b 
vp90-2-02-size-66x18.webm.md5 +682b36a25774bbdedcd603f504d18eb63f0167d4 vp90-2-02-size-66x32.webm +53728fae2a428f16d376a29f341a64ddca97996a vp90-2-02-size-66x32.webm.md5 +e71b70e901e29eaa6672a6aa4f37f6f5faa02bd6 vp90-2-02-size-66x34.webm +f69a6a555e3f614b0a35f9bfc313d8ebb35bc725 vp90-2-02-size-66x34.webm.md5 +4151b8c29452d5c2266397a7b9bf688899a2937b vp90-2-02-size-66x64.webm +69486e7fd9e380b6c97a03d3e167affc79f73840 vp90-2-02-size-66x64.webm.md5 +68784a1ecac776fe2a3f230345af32f06f123536 vp90-2-02-size-66x66.webm +7f008c7f48d55e652fbd6bac405b51e0015c94f2 vp90-2-02-size-66x66.webm.md5 +7e1bc449231ac1c5c2a11c9a6333b3e828763798 vp90-2-03-size-196x196.webm +6788a561466dace32d500194bf042e19cccc35e1 vp90-2-03-size-196x196.webm.md5 +a170c9a88ec1dd854c7a471ff55fb2a97ac31870 vp90-2-03-size-196x198.webm +6bf9d6a8e2bdc5bf4f8a78071a3fed5ca02ad6f2 vp90-2-03-size-196x198.webm.md5 +68f861d21c4c8b03d572c3d3fcd9f4fbf1f4503f vp90-2-03-size-196x200.webm +bbfc260b2bfd872cc6054272bb6b7f959a9e1c6e vp90-2-03-size-196x200.webm.md5 +fc34889feeca2b7e5b27b4f1ce22d2e2b8e3e4b1 vp90-2-03-size-196x202.webm +158ee72af578f39aad0c3b8f4cbed2fc78b57e0f vp90-2-03-size-196x202.webm.md5 +dd28fb7247af534bdf5e6795a3ac429610489a0b vp90-2-03-size-196x208.webm +7546be847efce2d1c0a23f807bfb03f91b764e1e vp90-2-03-size-196x208.webm.md5 +41d5cf5ed65b722a1b6dc035e67f978ea8ffecf8 vp90-2-03-size-196x210.webm +9444fdf632d6a1b6143f4cb10fed8f63c1d67ec1 vp90-2-03-size-196x210.webm.md5 +5007bc618143437c009d6dde5fc2e86f72d37dc2 vp90-2-03-size-196x224.webm +858361d8f79b44df5545feabbc9754ec9ede632f vp90-2-03-size-196x224.webm.md5 +0bcbe357fbc776c3fa68e7117179574ed7564a44 vp90-2-03-size-196x226.webm +72006a5f42031a43d70a2cd9fc1958962a86628f vp90-2-03-size-196x226.webm.md5 +000239f048cceaac055558e97ef07078ebf65502 vp90-2-03-size-198x196.webm +2d6841901b72000c5340f30be602853438c1b787 vp90-2-03-size-198x196.webm.md5 +ae75b766306a6404c3b3b35a6b6d53633c14fbdb vp90-2-03-size-198x198.webm +3f2544b4f3b4b643a98f2c3b15ea5826fc702fa1 vp90-2-03-size-198x198.webm.md5 +95ffd573fa84ccef1cd59e1583e6054f56a5c83d vp90-2-03-size-198x200.webm +5d537e3c9b9c54418c79677543454c4cda3de1af vp90-2-03-size-198x200.webm.md5 +ecc845bf574375f469bc91bf5c75c79dc00073d6 vp90-2-03-size-198x202.webm +1b59f5e111265615a7a459eeda8cc9045178d228 vp90-2-03-size-198x202.webm.md5 +432fb27144fe421b9f51cf44d2750a26133ed585 vp90-2-03-size-198x208.webm +a58a67f4fb357c73ca078aeecbc0f782975630b1 vp90-2-03-size-198x208.webm.md5 +ff5058e7e6a47435046612afc8536f2040989e6f vp90-2-03-size-198x210.webm +18d3be7935e52217e2e9400b6f2c681a9e45dc89 vp90-2-03-size-198x210.webm.md5 +a0d55263c1ed2c03817454dd4ec4090d36dbc864 vp90-2-03-size-198x224.webm +efa366a299817e2da51c00623b165aab9fbb8d91 vp90-2-03-size-198x224.webm.md5 +ccd142fa2920fc85bb753f049160c1c353ad1574 vp90-2-03-size-198x226.webm +534524a0b2dbff852e0b92ef09939db072f83243 vp90-2-03-size-198x226.webm.md5 +0d483b94ed40abc8ab6e49f960432ee54ad9c7f1 vp90-2-03-size-200x196.webm +41795f548181717906e7a504ba551f06c32102ae vp90-2-03-size-200x196.webm.md5 +f6c2dc54e0989d50f01333fe40c91661fcbf849a vp90-2-03-size-200x198.webm +43df5d8c46a40089441392e6d096c588c1079a68 vp90-2-03-size-200x198.webm.md5 +2f6e9df82e44fc145f0d9212dcccbed3de605e23 vp90-2-03-size-200x200.webm +757b2ef96b82093255725bab9690bbafe27f3caf vp90-2-03-size-200x200.webm.md5 +40c5ea60415642a4a2e75c0d127b06309baadfab vp90-2-03-size-200x202.webm +3022c4a1c625b5dc04fdb1052d17d45b4171cfba vp90-2-03-size-200x202.webm.md5 +6942ed5b27476bb8506d10e600d6ff60887780ca vp90-2-03-size-200x208.webm 
+c4ab8c66f3cf2dc8e8dd7abae9ac21f4d32cd6be vp90-2-03-size-200x208.webm.md5 +71dbc99b83c49d1da45589b91eabb98e2f4a7b1e vp90-2-03-size-200x210.webm +3f0b40da7eef7974b9bc326562f251feb67d9c7c vp90-2-03-size-200x210.webm.md5 +6b6b8489081cfefb377cc5f18eb754ec2383f655 vp90-2-03-size-200x224.webm +a259df2ac0e294492e3f9d4315baa34cab044f04 vp90-2-03-size-200x224.webm.md5 +c9adc1c9bb07559349a0b054df4af56f7a6edbb9 vp90-2-03-size-200x226.webm +714cec61e3575581e4f1a0e3921f4dfdbbd316c5 vp90-2-03-size-200x226.webm.md5 +f9bdc936bdf53f8be9ce78fecd41a21d31ff3943 vp90-2-03-size-202x196.webm +5b8e2e50fcea2c43b12fc067b8a9cc117af77bda vp90-2-03-size-202x196.webm.md5 +c7b66ea3da87613deb47ff24a111247d3c384fec vp90-2-03-size-202x198.webm +517e91204b25586da943556f4adc5951c9be8bee vp90-2-03-size-202x198.webm.md5 +935ef56b01cfdb4265a7e24696645209ccb20970 vp90-2-03-size-202x200.webm +55b8ec4a2513183144a8e27564596c06c7576fce vp90-2-03-size-202x200.webm.md5 +849acf75e4f1d8d90046704e1103a18c64f30e35 vp90-2-03-size-202x202.webm +c79afc6660df2824e7df314e5bfd71f0d8acf76b vp90-2-03-size-202x202.webm.md5 +17b3a4d55576b770626ccb856b9f1a6c8f6ae476 vp90-2-03-size-202x208.webm +0b887ff30409c58f2ccdc3bfacd6be7c69f8997a vp90-2-03-size-202x208.webm.md5 +032d0ade4230fb2eef6d19915a7a1c9aa4a52617 vp90-2-03-size-202x210.webm +f78f8e79533c0c88dd2bfdcec9b1c07848568ece vp90-2-03-size-202x210.webm.md5 +915a38c31fe425d5b93c837121cfa8082f5ea5bc vp90-2-03-size-202x224.webm +bf52a104074d0c5942aa7a5b31e11db47e43d48e vp90-2-03-size-202x224.webm.md5 +be5cfde35666fa435e47d544d9258215beb1cf29 vp90-2-03-size-202x226.webm +2fa2f87502fda756b319389c8975204e130a2e3f vp90-2-03-size-202x226.webm.md5 +15d908e97862b5b4bf295610df011fb9aa09909b vp90-2-03-size-208x196.webm +50c60792305d6a99be376dd596a6ff979325e6cc vp90-2-03-size-208x196.webm.md5 +a367c7bc9fde56d6f4848cc573c7d4c1ce75e348 vp90-2-03-size-208x198.webm +be85fb2c8d435a75484231356f07d06ebddd13cd vp90-2-03-size-208x198.webm.md5 +05fd46deb7288e7253742091f56e54a9a441a187 vp90-2-03-size-208x200.webm +74f8ec3b3a2fe81767ed1ab36a47bc0062d6223c vp90-2-03-size-208x200.webm.md5 +d8985c4b386513a7385a4b3639bf91e469f1378b vp90-2-03-size-208x202.webm +0614a1e8d92048852adcf605a51333f5fabc7f03 vp90-2-03-size-208x202.webm.md5 +28b002242238479165ba4fb87ee6b442c64b32e4 vp90-2-03-size-208x208.webm +37de5aca59bb900228400b0e115d3229edb9dcc0 vp90-2-03-size-208x208.webm.md5 +c545be0050c2fad7c68427dbf86c62a739e94ab3 vp90-2-03-size-208x210.webm +d646eccb3cd578f94b54777e32b88898bef6e17a vp90-2-03-size-208x210.webm.md5 +63a0cfe295b661026dd7b1bebb67acace1db766f vp90-2-03-size-208x224.webm +85c0361d93bf85a335248fef2767ff43eeef23db vp90-2-03-size-208x224.webm.md5 +f911cc718d66e4fe8a865226088939c9eb1b7825 vp90-2-03-size-208x226.webm +a6d583a57876e7b7ec48625b2b2cdbcf70cab837 vp90-2-03-size-208x226.webm.md5 +5bbb0f36da9a4683cf04e724124d8696332911bf vp90-2-03-size-210x196.webm +a3580fc7816d7fbcfb54fdba501cabbd06ba2f1d vp90-2-03-size-210x196.webm.md5 +8db64d6f9ce36dd382013b42ae4e292deba697bc vp90-2-03-size-210x198.webm +eda20f8268c7f4147bead4059e9c4897e09140a9 vp90-2-03-size-210x198.webm.md5 +ce391505eeaf1d12406563101cd6b2dbbbb44bfc vp90-2-03-size-210x200.webm +79d73b7f623082d2a00aa33e95c79d11c7d9c3a8 vp90-2-03-size-210x200.webm.md5 +852db6fdc206e72391fc69b807f1954934679949 vp90-2-03-size-210x202.webm +f69414c5677ed2f2b8b37ae76429e509a92276a5 vp90-2-03-size-210x202.webm.md5 +c424cc3edd2308da7d33f27acb36b54db5bf2595 vp90-2-03-size-210x208.webm +27b18562faa1b3184256f4eae8114b539b3e9d3e vp90-2-03-size-210x208.webm.md5 
+dd029eba719d50a2851592fa8b9b2efe88904930 vp90-2-03-size-210x210.webm +c853a1670465eaa04ca31b3511995f1b6ed4f58f vp90-2-03-size-210x210.webm.md5 +d962e8ae676c54d0c3ea04ec7c04b37ae6a786e3 vp90-2-03-size-210x224.webm +93b793e79d987065b39ad8e2e71244368435fc25 vp90-2-03-size-210x224.webm.md5 +3d0825fe83bcc125be1f78145ff43ca6d7588784 vp90-2-03-size-210x226.webm +5230f31a57ca3b5311698a12035d2644533b3ec4 vp90-2-03-size-210x226.webm.md5 +6622f8bd9279e1ce45509a58a31a990052d45e14 vp90-2-03-size-224x196.webm +65411da07f60113f2be05c807879072b161d561e vp90-2-03-size-224x196.webm.md5 +6744ff2ee2c41eb08c62ff30880833b6d77b585b vp90-2-03-size-224x198.webm +46ea3641d41acd4bff347b224646c060d5620385 vp90-2-03-size-224x198.webm.md5 +8eb91f3416a1404705f370caecd74b2b458351b1 vp90-2-03-size-224x200.webm +196aefb854c8b95b9330263d6690b7ee15693ecf vp90-2-03-size-224x200.webm.md5 +256a5a23ef4e6d5ef2871af5afb8cd13d28cec00 vp90-2-03-size-224x202.webm +840ad8455dcf2be378c14b007e66fa642fc8196d vp90-2-03-size-224x202.webm.md5 +db4606480ab48b96c9a6ff5e639f1f1aea2a12e4 vp90-2-03-size-224x208.webm +40b9801d5620467499ac70fa6b7c40aaa5e1c331 vp90-2-03-size-224x208.webm.md5 +e37159e687fe1cb24cffddfae059301adbaf4212 vp90-2-03-size-224x210.webm +1e4acd4b6334ae260c3eed08652d0ba8122073f2 vp90-2-03-size-224x210.webm.md5 +0de1eb4bb6285ae621e4f2b613d2aa4a8c95a130 vp90-2-03-size-224x224.webm +37db449ad86fb286c2c02d94aa8fe0379c05044a vp90-2-03-size-224x224.webm.md5 +32ebbf903a7d7881bcfe59639f1d472371f3bf27 vp90-2-03-size-224x226.webm +5cc3ac5dc9f6912491aa2ddac863f8187f34c569 vp90-2-03-size-224x226.webm.md5 +9480ff5c2c32b1870ac760c87514912616e6cf01 vp90-2-03-size-226x196.webm +fe83655c0f1888f0af7b047785f01ba7ca9f1324 vp90-2-03-size-226x196.webm.md5 +09cad4221996315cdddad4e502dbfabf53ca1d6a vp90-2-03-size-226x198.webm +e3ddfdc650acb95adb45abd9b634e1f09ea8ac96 vp90-2-03-size-226x198.webm.md5 +c34f49d55fe39e3f0b607e3cc95e30244225cecb vp90-2-03-size-226x200.webm +abb83edc868a3523ccd4e5523fac2efbe7c3df1f vp90-2-03-size-226x200.webm.md5 +d17bc08eedfc60c4c23d576a6c964a21bf854d1f vp90-2-03-size-226x202.webm +1d22d2d0f375251c2d5a1acb4714bc35d963865b vp90-2-03-size-226x202.webm.md5 +9bd537c4f92a25596ccd29fedfe181feac948b92 vp90-2-03-size-226x208.webm +6feb0e7325386275719f3511ada9e248a2ae7df4 vp90-2-03-size-226x208.webm.md5 +4487067f6cedd495b93696b44b37fe0a3e7eda14 vp90-2-03-size-226x210.webm +49a8fa87945f47208168d541c068e78d878075d5 vp90-2-03-size-226x210.webm.md5 +559fea2f8da42b33c1aa1dbc34d1d6781009847a vp90-2-03-size-226x224.webm +83c6d8f2969b759e10e5c6542baca1265c874c29 vp90-2-03-size-226x224.webm.md5 +fe0af2ee47b1e5f6a66db369e2d7e9d870b38dce vp90-2-03-size-226x226.webm +94ad19b8b699cea105e2ff18f0df2afd7242bcf7 vp90-2-03-size-226x226.webm.md5 diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index 806901d..619533a 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -25,6 +25,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c LIBVPX_TEST_SRCS-yes += decode_test_driver.cc @@ -66,6 +68,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc 
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc @@ -227,223 +230,401 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-akiyo-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-25.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-25.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bowing-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-bus-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-cheer-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-city-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-200.webm 
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-3600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-3600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-5200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-coastguard-5200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-container-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-deadline-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-flower-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-football-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-foreman-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-00-hallmonitor-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-hallmonitor-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-3600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-3600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-5200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-harbour-5200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-highway-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-husky-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-ice-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-mobile-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-motherdaughter-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-300.webm 
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-news-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-25.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-25.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-pamphlet-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-paris-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-signirene-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-1000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-1000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-50.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-silent-50.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-soccer-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-1600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-1600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-2800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-2800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-stefan-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-students-600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-00-students-600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-1200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-1200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-3600.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-3600.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-5200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tempete-5200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-100.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-100.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-2000.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-2000.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-300.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-300.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-4400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-4400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-tennis-800.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-150.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-150.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-200.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-200.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-400.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-400.webm.md5 -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-800.webm -LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-00-waterfall-800.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-02.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-03.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-04.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-05.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-06.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-07.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-09.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-11.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-12.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-13.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-14.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-15.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-17.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-19.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-20.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-21.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-22.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-23.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-24.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-25.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-26.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-27.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-28.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-29.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-30.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-31.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-33.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-35.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-00-quantizer-35.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-36.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-37.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-38.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-39.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-40.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-41.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-42.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-43.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-44.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-45.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-46.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-47.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-48.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-49.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-50.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-51.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-52.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-53.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-54.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-55.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-56.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-57.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-58.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-59.webm.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-60.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-61.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-62.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-63.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-1.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-2.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-3.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-4.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-5.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-6.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-01-sharpness-7.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-08x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-02-size-10x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-10x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-16x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-18x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-02-size-32x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-32x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-34x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-64x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x08.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x10.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x16.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x18.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x32.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x34.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-02-size-66x66.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-198x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-200x226.webm.md5 
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-202x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-208x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-210x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += 
vp90-2-03-size-224x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-224x226.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x196.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x198.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x200.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x202.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x208.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x210.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x224.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-226x226.webm.md5 diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc index d7bd184..9b0e9d5 100644 --- a/libvpx/test/test_vector_test.cc +++ b/libvpx/test/test_vector_test.cc @@ -60,61 +60,106 @@ const char *kVP8TestVectors[] = { #endif #if CONFIG_VP9_DECODER const char *kVP9TestVectors[] = { - "vp90-00-akiyo-200.webm", "vp90-00-akiyo-300.webm", - "vp90-00-akiyo-50.webm", "vp90-00-bowing-150.webm", - "vp90-00-bowing-25.webm", "vp90-00-bowing-400.webm", - "vp90-00-bus-100.webm", "vp90-00-bus-2000.webm", - "vp90-00-bus-300.webm", "vp90-00-bus-4400.webm", - "vp90-00-bus-800.webm", "vp90-00-cheer-1600.webm", - "vp90-00-cheer-2800.webm", "vp90-00-cheer-400.webm", - "vp90-00-cheer-600.webm", "vp90-00-city-1200.webm", - "vp90-00-city-2000.webm", "vp90-00-city-300.webm", - "vp90-00-city-600.webm", "vp90-00-coastguard-1200.webm", - "vp90-00-coastguard-200.webm", "vp90-00-coastguard-3600.webm", - "vp90-00-coastguard-5200.webm", "vp90-00-container-1000.webm", - "vp90-00-container-200.webm", "vp90-00-container-50.webm", - "vp90-00-deadline-1000.webm", "vp90-00-deadline-200.webm", - "vp90-00-deadline-50.webm", "vp90-00-flower-100.webm", - "vp90-00-flower-2000.webm", "vp90-00-flower-300.webm", - "vp90-00-flower-4400.webm", 
"vp90-00-flower-800.webm", - "vp90-00-football-1600.webm", "vp90-00-football-2800.webm", - "vp90-00-football-400.webm", "vp90-00-football-600.webm", - "vp90-00-foreman-1200.webm", "vp90-00-foreman-2000.webm", - "vp90-00-foreman-300.webm", "vp90-00-foreman-600.webm", - "vp90-00-hallmonitor-1200.webm", "vp90-00-hallmonitor-2000.webm", - "vp90-00-hallmonitor-300.webm", "vp90-00-hallmonitor-600.webm", - "vp90-00-harbour-1200.webm", "vp90-00-harbour-200.webm", - "vp90-00-harbour-3600.webm", "vp90-00-harbour-5200.webm", - "vp90-00-highway-100.webm", "vp90-00-highway-1600.webm", - "vp90-00-highway-2800.webm", "vp90-00-highway-50.webm", - "vp90-00-husky-100.webm", "vp90-00-husky-2000.webm", - "vp90-00-husky-300.webm", "vp90-00-husky-4400.webm", - "vp90-00-husky-800.webm", "vp90-00-ice-150.webm", - "vp90-00-ice-400.webm", "vp90-00-ice-800.webm", - "vp90-00-mobile-1600.webm", "vp90-00-mobile-2800.webm", - "vp90-00-mobile-400.webm", "vp90-00-mobile-600.webm", - "vp90-00-motherdaughter-100.webm", "vp90-00-motherdaughter-300.webm", - "vp90-00-motherdaughter-600.webm", "vp90-00-news-100.webm", - "vp90-00-news-300.webm", "vp90-00-news-600.webm", - "vp90-00-pamphlet-150.webm", "vp90-00-pamphlet-25.webm", - "vp90-00-pamphlet-400.webm", "vp90-00-paris-1000.webm", - "vp90-00-paris-200.webm", "vp90-00-paris-50.webm", - "vp90-00-signirene-1000.webm", "vp90-00-signirene-200.webm", - "vp90-00-signirene-50.webm", "vp90-00-silent-1000.webm", - "vp90-00-silent-200.webm", "vp90-00-silent-50.webm", - "vp90-00-soccer-100.webm", "vp90-00-soccer-2000.webm", - "vp90-00-soccer-300.webm", "vp90-00-soccer-4400.webm", - "vp90-00-soccer-800.webm", "vp90-00-stefan-1600.webm", - "vp90-00-stefan-2800.webm", "vp90-00-stefan-400.webm", - "vp90-00-stefan-600.webm", "vp90-00-students-100.webm", - "vp90-00-students-300.webm", "vp90-00-students-600.webm", - "vp90-00-tempete-1200.webm", "vp90-00-tempete-200.webm", - "vp90-00-tempete-3600.webm", "vp90-00-tempete-5200.webm", - "vp90-00-tennis-100.webm", "vp90-00-tennis-2000.webm", - "vp90-00-tennis-300.webm", "vp90-00-tennis-4400.webm", - "vp90-00-tennis-800.webm", "vp90-00-waterfall-150.webm", - "vp90-00-waterfall-200.webm", "vp90-00-waterfall-400.webm", - "vp90-00-waterfall-800.webm", + "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm", + "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm", + "vp90-2-00-quantizer-04.webm", "vp90-2-00-quantizer-05.webm", + "vp90-2-00-quantizer-06.webm", "vp90-2-00-quantizer-07.webm", + "vp90-2-00-quantizer-08.webm", "vp90-2-00-quantizer-09.webm", + "vp90-2-00-quantizer-10.webm", "vp90-2-00-quantizer-11.webm", + "vp90-2-00-quantizer-12.webm", "vp90-2-00-quantizer-13.webm", + "vp90-2-00-quantizer-14.webm", "vp90-2-00-quantizer-15.webm", + "vp90-2-00-quantizer-16.webm", "vp90-2-00-quantizer-17.webm", + "vp90-2-00-quantizer-18.webm", "vp90-2-00-quantizer-19.webm", + "vp90-2-00-quantizer-20.webm", "vp90-2-00-quantizer-21.webm", + "vp90-2-00-quantizer-22.webm", "vp90-2-00-quantizer-23.webm", + "vp90-2-00-quantizer-24.webm", "vp90-2-00-quantizer-25.webm", + "vp90-2-00-quantizer-26.webm", "vp90-2-00-quantizer-27.webm", + "vp90-2-00-quantizer-28.webm", "vp90-2-00-quantizer-29.webm", + "vp90-2-00-quantizer-30.webm", "vp90-2-00-quantizer-31.webm", + "vp90-2-00-quantizer-32.webm", "vp90-2-00-quantizer-33.webm", + "vp90-2-00-quantizer-34.webm", "vp90-2-00-quantizer-35.webm", + "vp90-2-00-quantizer-36.webm", "vp90-2-00-quantizer-37.webm", + "vp90-2-00-quantizer-38.webm", "vp90-2-00-quantizer-39.webm", + "vp90-2-00-quantizer-40.webm", 
"vp90-2-00-quantizer-41.webm", + "vp90-2-00-quantizer-42.webm", "vp90-2-00-quantizer-43.webm", + "vp90-2-00-quantizer-44.webm", "vp90-2-00-quantizer-45.webm", + "vp90-2-00-quantizer-46.webm", "vp90-2-00-quantizer-47.webm", + "vp90-2-00-quantizer-48.webm", "vp90-2-00-quantizer-49.webm", + "vp90-2-00-quantizer-50.webm", "vp90-2-00-quantizer-51.webm", + "vp90-2-00-quantizer-52.webm", "vp90-2-00-quantizer-53.webm", + "vp90-2-00-quantizer-54.webm", "vp90-2-00-quantizer-55.webm", + "vp90-2-00-quantizer-56.webm", "vp90-2-00-quantizer-57.webm", + "vp90-2-00-quantizer-58.webm", "vp90-2-00-quantizer-59.webm", + "vp90-2-00-quantizer-60.webm", "vp90-2-00-quantizer-61.webm", + "vp90-2-00-quantizer-62.webm", "vp90-2-00-quantizer-63.webm", + "vp90-2-01-sharpness-1.webm", "vp90-2-01-sharpness-2.webm", + "vp90-2-01-sharpness-3.webm", "vp90-2-01-sharpness-4.webm", + "vp90-2-01-sharpness-5.webm", "vp90-2-01-sharpness-6.webm", + "vp90-2-01-sharpness-7.webm", "vp90-2-02-size-08x08.webm", + "vp90-2-02-size-08x10.webm", "vp90-2-02-size-08x16.webm", + "vp90-2-02-size-08x18.webm", "vp90-2-02-size-08x32.webm", + "vp90-2-02-size-08x34.webm", "vp90-2-02-size-08x64.webm", + "vp90-2-02-size-08x66.webm", "vp90-2-02-size-10x08.webm", + "vp90-2-02-size-10x10.webm", "vp90-2-02-size-10x16.webm", + "vp90-2-02-size-10x18.webm", "vp90-2-02-size-10x32.webm", + "vp90-2-02-size-10x34.webm", "vp90-2-02-size-10x64.webm", + "vp90-2-02-size-10x66.webm", "vp90-2-02-size-16x08.webm", + "vp90-2-02-size-16x10.webm", "vp90-2-02-size-16x16.webm", + "vp90-2-02-size-16x18.webm", "vp90-2-02-size-16x32.webm", + "vp90-2-02-size-16x34.webm", "vp90-2-02-size-16x64.webm", + "vp90-2-02-size-16x66.webm", "vp90-2-02-size-18x08.webm", + "vp90-2-02-size-18x10.webm", "vp90-2-02-size-18x16.webm", + "vp90-2-02-size-18x18.webm", "vp90-2-02-size-18x32.webm", + "vp90-2-02-size-18x34.webm", "vp90-2-02-size-18x64.webm", + "vp90-2-02-size-18x66.webm", "vp90-2-02-size-32x08.webm", + "vp90-2-02-size-32x10.webm", "vp90-2-02-size-32x16.webm", + "vp90-2-02-size-32x18.webm", "vp90-2-02-size-32x32.webm", + "vp90-2-02-size-32x34.webm", "vp90-2-02-size-32x64.webm", + "vp90-2-02-size-32x66.webm", "vp90-2-02-size-34x08.webm", + "vp90-2-02-size-34x10.webm", "vp90-2-02-size-34x16.webm", + "vp90-2-02-size-34x18.webm", "vp90-2-02-size-34x32.webm", + "vp90-2-02-size-34x34.webm", "vp90-2-02-size-34x64.webm", + "vp90-2-02-size-34x66.webm", "vp90-2-02-size-64x08.webm", + "vp90-2-02-size-64x10.webm", "vp90-2-02-size-64x16.webm", + "vp90-2-02-size-64x18.webm", "vp90-2-02-size-64x32.webm", + "vp90-2-02-size-64x34.webm", "vp90-2-02-size-64x64.webm", + "vp90-2-02-size-64x66.webm", "vp90-2-02-size-66x08.webm", + "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm", + "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm", + "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm", + "vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm", + "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm", + "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm", + "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm", + "vp90-2-03-size-196x226.webm", "vp90-2-03-size-198x196.webm", + "vp90-2-03-size-198x198.webm", "vp90-2-03-size-198x200.webm", + "vp90-2-03-size-198x202.webm", "vp90-2-03-size-198x208.webm", + "vp90-2-03-size-198x210.webm", "vp90-2-03-size-198x224.webm", + "vp90-2-03-size-198x226.webm", "vp90-2-03-size-200x196.webm", + "vp90-2-03-size-200x198.webm", "vp90-2-03-size-200x200.webm", + "vp90-2-03-size-200x202.webm", "vp90-2-03-size-200x208.webm", + 
"vp90-2-03-size-200x210.webm", "vp90-2-03-size-200x224.webm", + "vp90-2-03-size-200x226.webm", "vp90-2-03-size-202x196.webm", + "vp90-2-03-size-202x198.webm", "vp90-2-03-size-202x200.webm", + "vp90-2-03-size-202x202.webm", "vp90-2-03-size-202x208.webm", + "vp90-2-03-size-202x210.webm", "vp90-2-03-size-202x224.webm", + "vp90-2-03-size-202x226.webm", "vp90-2-03-size-208x196.webm", + "vp90-2-03-size-208x198.webm", "vp90-2-03-size-208x200.webm", + "vp90-2-03-size-208x202.webm", "vp90-2-03-size-208x208.webm", + "vp90-2-03-size-208x210.webm", "vp90-2-03-size-208x224.webm", + "vp90-2-03-size-208x226.webm", "vp90-2-03-size-210x196.webm", + "vp90-2-03-size-210x198.webm", "vp90-2-03-size-210x200.webm", + "vp90-2-03-size-210x202.webm", "vp90-2-03-size-210x208.webm", + "vp90-2-03-size-210x210.webm", "vp90-2-03-size-210x224.webm", + "vp90-2-03-size-210x226.webm", "vp90-2-03-size-224x196.webm", + "vp90-2-03-size-224x198.webm", "vp90-2-03-size-224x200.webm", + "vp90-2-03-size-224x202.webm", "vp90-2-03-size-224x208.webm", + "vp90-2-03-size-224x210.webm", "vp90-2-03-size-224x224.webm", + "vp90-2-03-size-224x226.webm", "vp90-2-03-size-226x196.webm", + "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm", + "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm", + "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm", + "vp90-2-03-size-226x226.webm" }; #endif @@ -136,6 +181,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, virtual void DecompressedFrameHook(const vpx_image_t& img, const unsigned int frame_number) { + ASSERT_TRUE(md5_file_ != NULL); char expected_md5[33]; char junk[128]; diff --git a/libvpx/test/tile_independence_test.cc b/libvpx/test/tile_independence_test.cc index 9633ed7..403dbb6 100644 --- a/libvpx/test/tile_independence_test.cc +++ b/libvpx/test/tile_independence_test.cc @@ -23,10 +23,13 @@ extern "C" { namespace { class TileIndependenceTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { + public ::libvpx_test::CodecTestWithParam<int> { protected: - TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)), - md5_fw_order_(), md5_inv_order_() { + TileIndependenceTest() + : EncoderTest(GET_PARAM(0)), + md5_fw_order_(), + md5_inv_order_(), + n_tiles_(GET_PARAM(1)) { init_flags_ = VPX_CODEC_USE_PSNR; vpx_codec_dec_cfg_t cfg; cfg.w = 704; @@ -56,9 +59,8 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt, ::libvpx_test::MD5 *md5) { - const vpx_codec_err_t res = - dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf), - pkt->data.frame.sz); + const vpx_codec_err_t res = dec->DecodeFrame( + reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz); if (res != VPX_CODEC_OK) { abort_ = true; ASSERT_EQ(VPX_CODEC_OK, res); @@ -72,11 +74,11 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, UpdateMD5(inv_dec_, pkt, &md5_inv_order_); } - private: - int n_tiles_; - protected: ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_; ::libvpx_test::Decoder *fw_dec_, *inv_dec_; + + private: + int n_tiles_; }; // run an encode with 2 or 4 tiles, and do the decode both in normal and @@ -93,7 +95,7 @@ TEST_P(TileIndependenceTest, MD5Match) { timebase.den, timebase.num, 0, 30); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - const char *md5_fw_str = md5_fw_order_.Get(); + const char *md5_fw_str = md5_fw_order_.Get(); const char *md5_inv_str = md5_inv_order_.Get(); // could use ASSERT_EQ(!memcmp(.., .., 
16) here, but this gives nicer @@ -102,7 +104,6 @@ TEST_P(TileIndependenceTest, MD5Match) { ASSERT_STREQ(md5_fw_str, md5_inv_str); } -VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, - ::testing::Range(0, 2, 1)); +VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1)); } // namespace diff --git a/libvpx/test/util.h b/libvpx/test/util.h index 533a1db..4d7f3d4 100644 --- a/libvpx/test/util.h +++ b/libvpx/test/util.h @@ -37,7 +37,7 @@ static double compute_psnr(const vpx_image_t *img1, img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j]; sqrerr += d * d; } - double mse = sqrerr / (width_y * height_y); + double mse = static_cast<double>(sqrerr) / (width_y * height_y); double psnr = 100.0; if (mse > 0.0) { psnr = 10 * log10(255.0 * 255.0 / mse); diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index dfa1a07..207b6e7 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -13,10 +13,12 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/clear_system_state.h" +#include "test/register_state_check.h" #include "vpx/vpx_integer.h" #include "vpx_config.h" extern "C" { +#include "vpx_mem/vpx_mem.h" #if CONFIG_VP8_ENCODER # include "vp8/common/variance.h" # include "vp8_rtcd.h" @@ -26,12 +28,83 @@ extern "C" { # include "vp9_rtcd.h" #endif } +#include "test/acm_random.h" namespace { using ::std::tr1::get; using ::std::tr1::make_tuple; using ::std::tr1::tuple; +using libvpx_test::ACMRandom; + +static unsigned int variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int diff = ref[w * y + x] - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + +static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} + +static unsigned int subpel_avg_variance_ref(const uint8_t *ref, + const uint8_t *src, + const uint8_t *second_pred, + int l2w, int l2h, + int xoff, int yoff, + unsigned int *sse_ptr) { + int se = 0; + unsigned int sse = 0; + const int w = 1 << l2w, h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w 
* y + x]; + se += diff; + sse += diff * diff; + } + } + *sse_ptr = sse; + return sse - (((int64_t) se * se) >> (l2w + l2h)); +} template<typename VarianceFunctionType> class VarianceTest : @@ -39,10 +112,13 @@ class VarianceTest : public: virtual void SetUp() { const tuple<int, int, VarianceFunctionType>& params = this->GetParam(); - width_ = get<0>(params); - height_ = get<1>(params); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; variance_ = get<2>(params); + rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; src_ = new uint8_t[block_size_]; ref_ = new uint8_t[block_size_]; @@ -58,15 +134,16 @@ class VarianceTest : protected: void ZeroTest(); + void RefTest(); void OneQuarterTest(); + ACMRandom rnd; uint8_t* src_; uint8_t* ref_; - int width_; - int height_; + int width_, log2width_; + int height_, log2height_; int block_size_; VarianceFunctionType variance_; - }; template<typename VarianceFunctionType> @@ -76,24 +153,133 @@ void VarianceTest<VarianceFunctionType>::ZeroTest() { for (int j = 0; j <= 255; ++j) { memset(ref_, j, block_size_); unsigned int sse; - const unsigned int var = variance_(src_, width_, ref_, width_, &sse); + unsigned int var; + REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); EXPECT_EQ(0u, var) << "src values: " << i << "ref values: " << j; } } } template<typename VarianceFunctionType> +void VarianceTest<VarianceFunctionType>::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + unsigned int var1; + REGISTER_STATE_CHECK(var1 = variance_(src_, width_, ref_, width_, &sse1)); + const unsigned int var2 = variance_ref(src_, ref_, log2width_, + log2height_, &sse2); + EXPECT_EQ(sse1, sse2); + EXPECT_EQ(var1, var2); + } +} + +template<typename VarianceFunctionType> void VarianceTest<VarianceFunctionType>::OneQuarterTest() { memset(src_, 255, block_size_); const int half = block_size_ / 2; memset(ref_, 255, half); memset(ref_ + half, 0, half); unsigned int sse; - const unsigned int var = variance_(src_, width_, ref_, width_, &sse); + unsigned int var; + REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); const unsigned int expected = block_size_ * 255 * 255 / 4; EXPECT_EQ(expected, var); } +template<typename SubpelVarianceFunctionType> +class SubpelVarianceTest : + public ::testing::TestWithParam<tuple<int, int, + SubpelVarianceFunctionType> > { + public: + virtual void SetUp() { + const tuple<int, int, SubpelVarianceFunctionType>& params = + this->GetParam(); + log2width_ = get<0>(params); + width_ = 1 << log2width_; + log2height_ = get<1>(params); + height_ = 1 << log2height_; + subpel_variance_ = get<2>(params); + + rnd(ACMRandom::DeterministicSeed()); + block_size_ = width_ * height_; + src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); + sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); + ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(sec_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + vpx_free(src_); + delete[] ref_; + vpx_free(sec_); + libvpx_test::ClearSystemState(); + } + + protected: + void RefTest(); + + ACMRandom rnd; + uint8_t *src_; + uint8_t *ref_; + uint8_t *sec_; + int width_, log2width_; + int height_, log2height_; + int block_size_; + SubpelVarianceFunctionType subpel_variance_; +}; + 
+template<typename SubpelVarianceFunctionType> +void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + unsigned int var1; + REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1)); + const unsigned int var2 = subpel_variance_ref(ref_, src_, log2width_, + log2height_, x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + +template<> +void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() { + for (int x = 0; x < 16; ++x) { + for (int y = 0; y < 16; ++y) { + for (int j = 0; j < block_size_; j++) { + src_[j] = rnd.Rand8(); + sec_[j] = rnd.Rand8(); + } + for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + ref_[j] = rnd.Rand8(); + } + unsigned int sse1, sse2; + unsigned int var1; + REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, + src_, width_, &sse1, sec_)); + const unsigned int var2 = subpel_avg_variance_ref(ref_, src_, sec_, + log2width_, log2height_, + x, y, &sse2); + EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; + EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; + } + } +} + // ----------------------------------------------------------------------------- // VP8 test cases. @@ -103,6 +289,7 @@ namespace vp8 { typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest; TEST_P(VP8VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP8VarianceTest, Ref) { RefTest(); } TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); } const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c; @@ -112,11 +299,11 @@ const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; INSTANTIATE_TEST_CASE_P( C, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c))); #if HAVE_MMX const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; @@ -126,11 +313,11 @@ const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx; const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 @@ -141,11 +328,11 @@ const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; INSTANTIATE_TEST_CASE_P( SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - 
make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_wmt), + make_tuple(3, 3, variance8x8_wmt), + make_tuple(3, 4, variance8x16_wmt), + make_tuple(4, 3, variance16x8_wmt), + make_tuple(4, 4, variance16x16_wmt))); #endif #endif // CONFIG_VP8_ENCODER @@ -158,22 +345,127 @@ namespace vp9 { #if CONFIG_VP9_ENCODER typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest; +typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest; +typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest; TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } +TEST_P(VP9VarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; +const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; +const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c; const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c; const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c; const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c; +const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c; +const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c; +const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c; +const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c; +const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; +const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c; INSTANTIATE_TEST_CASE_P( C, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_c), - make_tuple(8, 8, variance8x8_c), - make_tuple(8, 16, variance8x16_c), - make_tuple(16, 8, variance16x8_c), - make_tuple(16, 16, variance16x16_c))); + ::testing::Values(make_tuple(2, 2, variance4x4_c), + make_tuple(2, 3, variance4x8_c), + make_tuple(3, 2, variance8x4_c), + make_tuple(3, 3, variance8x8_c), + make_tuple(3, 4, variance8x16_c), + make_tuple(4, 3, variance16x8_c), + make_tuple(4, 4, variance16x16_c), + make_tuple(4, 5, variance16x32_c), + make_tuple(5, 4, variance32x16_c), + make_tuple(5, 5, variance32x32_c), + make_tuple(5, 6, variance32x64_c), + make_tuple(6, 5, variance64x32_c), + make_tuple(6, 6, variance64x64_c))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_c = + vp9_sub_pixel_variance4x4_c; +const vp9_subpixvariance_fn_t subpel_variance4x8_c = + vp9_sub_pixel_variance4x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x4_c = + vp9_sub_pixel_variance8x4_c; +const vp9_subpixvariance_fn_t subpel_variance8x8_c = + vp9_sub_pixel_variance8x8_c; +const vp9_subpixvariance_fn_t subpel_variance8x16_c = + vp9_sub_pixel_variance8x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x8_c = + vp9_sub_pixel_variance16x8_c; +const vp9_subpixvariance_fn_t subpel_variance16x16_c = + vp9_sub_pixel_variance16x16_c; +const vp9_subpixvariance_fn_t subpel_variance16x32_c = + vp9_sub_pixel_variance16x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x16_c = + vp9_sub_pixel_variance32x16_c; +const vp9_subpixvariance_fn_t subpel_variance32x32_c = + vp9_sub_pixel_variance32x32_c; +const vp9_subpixvariance_fn_t subpel_variance32x64_c = + vp9_sub_pixel_variance32x64_c; +const vp9_subpixvariance_fn_t subpel_variance64x32_c = + vp9_sub_pixel_variance64x32_c; +const vp9_subpixvariance_fn_t subpel_variance64x64_c = + vp9_sub_pixel_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, 
VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c), + make_tuple(2, 3, subpel_variance4x8_c), + make_tuple(3, 2, subpel_variance8x4_c), + make_tuple(3, 3, subpel_variance8x8_c), + make_tuple(3, 4, subpel_variance8x16_c), + make_tuple(4, 3, subpel_variance16x8_c), + make_tuple(4, 4, subpel_variance16x16_c), + make_tuple(4, 5, subpel_variance16x32_c), + make_tuple(5, 4, subpel_variance32x16_c), + make_tuple(5, 5, subpel_variance32x32_c), + make_tuple(5, 6, subpel_variance32x64_c), + make_tuple(6, 5, subpel_variance64x32_c), + make_tuple(6, 6, subpel_variance64x64_c))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c = + vp9_sub_pixel_avg_variance4x4_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c = + vp9_sub_pixel_avg_variance4x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c = + vp9_sub_pixel_avg_variance8x4_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c = + vp9_sub_pixel_avg_variance8x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c = + vp9_sub_pixel_avg_variance8x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c = + vp9_sub_pixel_avg_variance16x8_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c = + vp9_sub_pixel_avg_variance16x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c = + vp9_sub_pixel_avg_variance16x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c = + vp9_sub_pixel_avg_variance32x16_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c = + vp9_sub_pixel_avg_variance32x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c = + vp9_sub_pixel_avg_variance32x64_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c = + vp9_sub_pixel_avg_variance64x32_c; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c = + vp9_sub_pixel_avg_variance64x64_c; +INSTANTIATE_TEST_CASE_P( + C, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c), + make_tuple(2, 3, subpel_avg_variance4x8_c), + make_tuple(3, 2, subpel_avg_variance8x4_c), + make_tuple(3, 3, subpel_avg_variance8x8_c), + make_tuple(3, 4, subpel_avg_variance8x16_c), + make_tuple(4, 3, subpel_avg_variance16x8_c), + make_tuple(4, 4, subpel_avg_variance16x16_c), + make_tuple(4, 5, subpel_avg_variance16x32_c), + make_tuple(5, 4, subpel_avg_variance32x16_c), + make_tuple(5, 5, subpel_avg_variance32x32_c), + make_tuple(5, 6, subpel_avg_variance32x64_c), + make_tuple(6, 5, subpel_avg_variance64x32_c), + make_tuple(6, 6, subpel_avg_variance64x64_c))); #if HAVE_MMX const vp9_variance_fn_t variance4x4_mmx = vp9_variance4x4_mmx; @@ -183,26 +475,212 @@ const vp9_variance_fn_t variance16x8_mmx = vp9_variance16x8_mmx; const vp9_variance_fn_t variance16x16_mmx = vp9_variance16x16_mmx; INSTANTIATE_TEST_CASE_P( MMX, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_mmx), - make_tuple(8, 8, variance8x8_mmx), - make_tuple(8, 16, variance8x16_mmx), - make_tuple(16, 8, variance16x8_mmx), - make_tuple(16, 16, variance16x16_mmx))); + ::testing::Values(make_tuple(2, 2, variance4x4_mmx), + make_tuple(3, 3, variance8x8_mmx), + make_tuple(3, 4, variance8x16_mmx), + make_tuple(4, 3, variance16x8_mmx), + make_tuple(4, 4, variance16x16_mmx))); #endif #if HAVE_SSE2 -const vp9_variance_fn_t variance4x4_wmt = vp9_variance4x4_sse2; -const vp9_variance_fn_t variance8x8_wmt = vp9_variance8x8_sse2; -const vp9_variance_fn_t variance8x16_wmt = vp9_variance8x16_sse2; -const vp9_variance_fn_t variance16x8_wmt = 
vp9_variance16x8_sse2; -const vp9_variance_fn_t variance16x16_wmt = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; +const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; +const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; +const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2; +const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2; +const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2; +const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2; +const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2; +const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2; +const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2; +const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2; +const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; +const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; INSTANTIATE_TEST_CASE_P( SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance4x4_wmt), - make_tuple(8, 8, variance8x8_wmt), - make_tuple(8, 16, variance8x16_wmt), - make_tuple(16, 8, variance16x8_wmt), - make_tuple(16, 16, variance16x16_wmt))); + ::testing::Values(make_tuple(2, 2, variance4x4_sse2), + make_tuple(2, 3, variance4x8_sse2), + make_tuple(3, 2, variance8x4_sse2), + make_tuple(3, 3, variance8x8_sse2), + make_tuple(3, 4, variance8x16_sse2), + make_tuple(4, 3, variance16x8_sse2), + make_tuple(4, 4, variance16x16_sse2), + make_tuple(4, 5, variance16x32_sse2), + make_tuple(5, 4, variance32x16_sse2), + make_tuple(5, 5, variance32x32_sse2), + make_tuple(5, 6, variance32x64_sse2), + make_tuple(6, 5, variance64x32_sse2), + make_tuple(6, 6, variance64x64_sse2))); + +const vp9_subpixvariance_fn_t subpel_variance4x4_sse = + vp9_sub_pixel_variance4x4_sse; +const vp9_subpixvariance_fn_t subpel_variance4x8_sse = + vp9_sub_pixel_variance4x8_sse; +const vp9_subpixvariance_fn_t subpel_variance8x4_sse2 = + vp9_sub_pixel_variance8x4_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x8_sse2 = + vp9_sub_pixel_variance8x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance8x16_sse2 = + vp9_sub_pixel_variance8x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x8_sse2 = + vp9_sub_pixel_variance16x8_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x16_sse2 = + vp9_sub_pixel_variance16x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance16x32_sse2 = + vp9_sub_pixel_variance16x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x16_sse2 = + vp9_sub_pixel_variance32x16_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x32_sse2 = + vp9_sub_pixel_variance32x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance32x64_sse2 = + vp9_sub_pixel_variance32x64_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x32_sse2 = + vp9_sub_pixel_variance64x32_sse2; +const vp9_subpixvariance_fn_t subpel_variance64x64_sse2 = + vp9_sub_pixel_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse), + make_tuple(2, 3, subpel_variance4x8_sse), + make_tuple(3, 2, subpel_variance8x4_sse2), + make_tuple(3, 3, subpel_variance8x8_sse2), + make_tuple(3, 4, subpel_variance8x16_sse2), + make_tuple(4, 3, subpel_variance16x8_sse2), + make_tuple(4, 4, subpel_variance16x16_sse2), + make_tuple(4, 5, subpel_variance16x32_sse2), + make_tuple(5, 4, subpel_variance32x16_sse2), + make_tuple(5, 5, subpel_variance32x32_sse2), + make_tuple(5, 6, 
subpel_variance32x64_sse2), + make_tuple(6, 5, subpel_variance64x32_sse2), + make_tuple(6, 6, subpel_variance64x64_sse2))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse = + vp9_sub_pixel_avg_variance4x4_sse; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse = + vp9_sub_pixel_avg_variance4x8_sse; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 = + vp9_sub_pixel_avg_variance8x4_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 = + vp9_sub_pixel_avg_variance8x8_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 = + vp9_sub_pixel_avg_variance8x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 = + vp9_sub_pixel_avg_variance16x8_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 = + vp9_sub_pixel_avg_variance16x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 = + vp9_sub_pixel_avg_variance16x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 = + vp9_sub_pixel_avg_variance32x16_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 = + vp9_sub_pixel_avg_variance32x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 = + vp9_sub_pixel_avg_variance32x64_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 = + vp9_sub_pixel_avg_variance64x32_sse2; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 = + vp9_sub_pixel_avg_variance64x64_sse2; +INSTANTIATE_TEST_CASE_P( + SSE2, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse), + make_tuple(2, 3, subpel_avg_variance4x8_sse), + make_tuple(3, 2, subpel_avg_variance8x4_sse2), + make_tuple(3, 3, subpel_avg_variance8x8_sse2), + make_tuple(3, 4, subpel_avg_variance8x16_sse2), + make_tuple(4, 3, subpel_avg_variance16x8_sse2), + make_tuple(4, 4, subpel_avg_variance16x16_sse2), + make_tuple(4, 5, subpel_avg_variance16x32_sse2), + make_tuple(5, 4, subpel_avg_variance32x16_sse2), + make_tuple(5, 5, subpel_avg_variance32x32_sse2), + make_tuple(5, 6, subpel_avg_variance32x64_sse2), + make_tuple(6, 5, subpel_avg_variance64x32_sse2), + make_tuple(6, 6, subpel_avg_variance64x64_sse2))); +#endif + +#if HAVE_SSSE3 +const vp9_subpixvariance_fn_t subpel_variance4x4_ssse3 = + vp9_sub_pixel_variance4x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance4x8_ssse3 = + vp9_sub_pixel_variance4x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x4_ssse3 = + vp9_sub_pixel_variance8x4_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x8_ssse3 = + vp9_sub_pixel_variance8x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance8x16_ssse3 = + vp9_sub_pixel_variance8x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x8_ssse3 = + vp9_sub_pixel_variance16x8_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x16_ssse3 = + vp9_sub_pixel_variance16x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance16x32_ssse3 = + vp9_sub_pixel_variance16x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x16_ssse3 = + vp9_sub_pixel_variance32x16_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x32_ssse3 = + vp9_sub_pixel_variance32x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance32x64_ssse3 = + vp9_sub_pixel_variance32x64_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x32_ssse3 = + vp9_sub_pixel_variance64x32_ssse3; +const vp9_subpixvariance_fn_t subpel_variance64x64_ssse3 = + vp9_sub_pixel_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelVarianceTest, + 
::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3), + make_tuple(2, 3, subpel_variance4x8_ssse3), + make_tuple(3, 2, subpel_variance8x4_ssse3), + make_tuple(3, 3, subpel_variance8x8_ssse3), + make_tuple(3, 4, subpel_variance8x16_ssse3), + make_tuple(4, 3, subpel_variance16x8_ssse3), + make_tuple(4, 4, subpel_variance16x16_ssse3), + make_tuple(4, 5, subpel_variance16x32_ssse3), + make_tuple(5, 4, subpel_variance32x16_ssse3), + make_tuple(5, 5, subpel_variance32x32_ssse3), + make_tuple(5, 6, subpel_variance32x64_ssse3), + make_tuple(6, 5, subpel_variance64x32_ssse3), + make_tuple(6, 6, subpel_variance64x64_ssse3))); + +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 = + vp9_sub_pixel_avg_variance4x4_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 = + vp9_sub_pixel_avg_variance4x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 = + vp9_sub_pixel_avg_variance8x4_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 = + vp9_sub_pixel_avg_variance8x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 = + vp9_sub_pixel_avg_variance8x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 = + vp9_sub_pixel_avg_variance16x8_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 = + vp9_sub_pixel_avg_variance16x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 = + vp9_sub_pixel_avg_variance16x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 = + vp9_sub_pixel_avg_variance32x16_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 = + vp9_sub_pixel_avg_variance32x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 = + vp9_sub_pixel_avg_variance32x64_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 = + vp9_sub_pixel_avg_variance64x32_ssse3; +const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 = + vp9_sub_pixel_avg_variance64x64_ssse3; +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9SubpelAvgVarianceTest, + ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3), + make_tuple(2, 3, subpel_avg_variance4x8_ssse3), + make_tuple(3, 2, subpel_avg_variance8x4_ssse3), + make_tuple(3, 3, subpel_avg_variance8x8_ssse3), + make_tuple(3, 4, subpel_avg_variance8x16_ssse3), + make_tuple(4, 3, subpel_avg_variance16x8_ssse3), + make_tuple(4, 4, subpel_avg_variance16x16_ssse3), + make_tuple(4, 5, subpel_avg_variance16x32_ssse3), + make_tuple(5, 4, subpel_avg_variance32x16_ssse3), + make_tuple(5, 5, subpel_avg_variance32x32_ssse3), + make_tuple(5, 6, subpel_avg_variance32x64_ssse3), + make_tuple(6, 5, subpel_avg_variance64x32_ssse3), + make_tuple(6, 6, subpel_avg_variance64x64_ssse3))); #endif #endif // CONFIG_VP9_ENCODER diff --git a/libvpx/test/vp9_lossless_test.cc b/libvpx/test/vp9_lossless_test.cc new file mode 100644 index 0000000..441cc44 --- /dev/null +++ b/libvpx/test/vp9_lossless_test.cc @@ -0,0 +1,75 @@ +/* + Copyright (c) 2012 The WebM project authors. All Rights Reserved. + + Use of this source code is governed by a BSD-style license + that can be found in the LICENSE file in the root of the source + tree. An additional intellectual property rights grant can be found + in the file PATENTS. All contributing project authors may + be found in the AUTHORS file in the root of the source tree. 
+*/ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const int kMaxPsnr = 100; + +class LossLessTest : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { + protected: + LossLessTest() : EncoderTest(GET_PARAM(0)), + psnr_(kMaxPsnr), + nframes_(0), + encoding_mode_(GET_PARAM(1)) { + } + + virtual ~LossLessTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.psnr.psnr[0] < psnr_) + psnr_= pkt->data.psnr.psnr[0]; + } + + double GetMinPsnr() const { + return psnr_; + } + + private: + double psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; +}; + +TEST_P(LossLessTest, TestLossLessEncoding) { + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + // intentionally changed the dimension for better testing coverage + libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284, + timebase.den, timebase.num, 0, 30); + + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} +VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES); +} // namespace diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc new file mode 100644 index 0000000..3e5fe8d --- /dev/null +++ b/libvpx/test/vp9_subtract_test.cc @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +extern "C" { +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_mem/vpx_mem.h" +} + +typedef void (*subtract_fn_t)(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +namespace vp9 { + +class VP9SubtractBlockTest : public ::testing::TestWithParam<subtract_fn_t> { + public: + virtual void TearDown() { + libvpx_test::ClearSystemState(); + } +}; + +using libvpx_test::ACMRandom; + +TEST_P(VP9SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // FIXME(rbultje) split in its own file + for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES; + bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) { + const int block_width = 4 << b_width_log2(bsize); + const int block_height = 4 << b_height_log2(bsize); + int16_t *diff = reinterpret_cast<int16_t *>( + vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2)); + uint8_t *pred = reinterpret_cast<uint8_t *>( + vpx_memalign(16, block_width * block_height * 2)); + uint8_t *src = reinterpret_cast<uint8_t *>( + vpx_memalign(16, block_width * block_height * 2)); + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width * 2; ++c) { + src[r * block_width * 2 + c] = rnd.Rand8(); + pred[r * block_width * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height, block_width, diff, block_width, + src, block_width, pred, block_width); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width + c], + (src[r * block_width + c] - + pred[r * block_width + c])) << "r = " << r + << ", c = " << c + << ", bs = " << bsize; + } + } + + GetParam()(block_height, block_width, diff, block_width * 2, + src, block_width * 2, pred, block_width * 2); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width * 2 + c], + (src[r * block_width * 2 + c] - + pred[r * block_width * 2 + c])) << "r = " << r + << ", c = " << c + << ", bs = " << bsize; + } + } + } + vpx_free(diff); + vpx_free(pred); + vpx_free(src); + } +} + +INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_sse2)); +#endif + +} // namespace vp9 diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h index c7919a9..9fc8545 100644 --- a/libvpx/test/webm_video_source.h +++ b/libvpx/test/webm_video_source.h @@ -99,7 +99,7 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_) << "Input file open failed. Filename: " + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. 
Filename: " << file_name_; nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, @@ -130,6 +130,7 @@ class WebMVideoSource : public CompressedVideoSource { } void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); if (chunk_ >= chunks_) { unsigned int track; diff --git a/libvpx/third_party/libyuv/source/scale.c b/libvpx/third_party/libyuv/source/scale.c index 72a817d..3c30b55 100644 --- a/libvpx/third_party/libyuv/source/scale.c +++ b/libvpx/third_party/libyuv/source/scale.c @@ -1370,12 +1370,12 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 cmp eax, 0 je xloop1 - cmp eax, 128 + cmp eax, 64 je xloop2 - shr eax, 1 mov ah,al neg al add al, 128 @@ -2132,11 +2132,11 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr, "mov 0x14(%esp),%edx \n" "mov 0x18(%esp),%ecx \n" "mov 0x1c(%esp),%eax \n" + "shr %eax \n" "cmp $0x0,%eax \n" "je 2f \n" - "cmp $0x80,%eax \n" + "cmp $0x40,%eax \n" "je 3f \n" - "shr %eax \n" "mov %al,%ah \n" "neg %al \n" "add $0x80,%al \n" @@ -2662,6 +2662,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { + source_y_fraction >>= 1; if (source_y_fraction == 0) { asm volatile ( "1:" @@ -2680,7 +2681,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, : "memory", "cc", "rax" ); return; - } else if (source_y_fraction == 128) { + } else if (source_y_fraction == 64) { asm volatile ( "1:" "movdqa (%1),%%xmm0 \n" @@ -2703,7 +2704,6 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, } else { asm volatile ( "mov %3,%%eax \n" - "shr %%eax \n" "mov %%al,%%ah \n" "neg %%al \n" "add $0x80,%%al \n" diff --git a/libvpx/vp8/common/alloccommon.c b/libvpx/vp8/common/alloccommon.c index 8af9e90..54afc13 100644 --- a/libvpx/vp8/common/alloccommon.c +++ b/libvpx/vp8/common/alloccommon.c @@ -173,7 +173,6 @@ void vp8_create_common(VP8_COMMON *oci) oci->use_bilinear_mc_filter = 0; oci->full_pixel = 0; oci->multi_token_partition = ONE_PARTITION; - oci->clr_type = REG_YUV; oci->clamp_type = RECON_CLAMP_REQUIRED; /* Initialize reference frame sign bias structure to defaults */ diff --git a/libvpx/vp8/common/onyxc_int.h b/libvpx/vp8/common/onyxc_int.h index 276dd72..e9bb7af 100644 --- a/libvpx/vp8/common/onyxc_int.h +++ b/libvpx/vp8/common/onyxc_int.h @@ -72,7 +72,6 @@ typedef struct VP8Common int horiz_scale; int vert_scale; - YUV_TYPE clr_type; CLAMP_TYPE clamp_type; YV12_BUFFER_CONFIG *frame_to_show; @@ -115,9 +114,6 @@ typedef struct VP8Common int uvdc_delta_q; int uvac_delta_q; - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. 
*/ @@ -157,7 +153,6 @@ typedef struct VP8Common unsigned int current_video_frame; - int near_boffset[3]; int version; TOKEN_PARTITION multi_token_partition; @@ -165,8 +160,10 @@ typedef struct VP8Common #ifdef PACKET_TESTING VP8_HEADER oh; #endif +#if CONFIG_POSTPROC_VISUALIZER double bitrate; double framerate; +#endif #if CONFIG_MULTITHREAD int processor_core_count; diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c index 0266f4c..dd998f1 100644 --- a/libvpx/vp8/common/postproc.c +++ b/libvpx/vp8/common/postproc.c @@ -923,7 +923,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t if (flags & VP8D_DEBUG_TXT_RATE_INFO) { char message[512]; - sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate); + sprintf(message, "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate); vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); } diff --git a/libvpx/vp8/common/vp8_asm_com_offsets.c b/libvpx/vp8/common/vp8_asm_com_offsets.c deleted file mode 100644 index 7bab90f..0000000 --- a/libvpx/vp8/common/vp8_asm_com_offsets.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vpx/vpx_codec.h" -#include "vpx_ports/asm_offsets.h" -#include "vp8/common/blockd.h" - -#if CONFIG_POSTPROC -#include "postproc.h" -#endif /* CONFIG_POSTPROC */ - -BEGIN - -#if CONFIG_POSTPROC -/* mfqe.c / filter_by_weight */ -DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION); -#endif /* CONFIG_POSTPROC */ - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ - -#if HAVE_MEDIA -/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */ -ct_assert(B_DC_PRED, B_DC_PRED == 0); -ct_assert(B_TM_PRED, B_TM_PRED == 1); -ct_assert(B_VE_PRED, B_VE_PRED == 2); -ct_assert(B_HE_PRED, B_HE_PRED == 3); -ct_assert(B_LD_PRED, B_LD_PRED == 4); -ct_assert(B_RD_PRED, B_RD_PRED == 5); -ct_assert(B_VR_PRED, B_VR_PRED == 6); -ct_assert(B_VL_PRED, B_VL_PRED == 7); -ct_assert(B_HD_PRED, B_HD_PRED == 8); -ct_assert(B_HU_PRED, B_HU_PRED == 9); -#endif - -#if HAVE_SSE2 -#if CONFIG_POSTPROC -/* vp8_filter_by_weight16x16 and 8x8 */ -ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4) -#endif /* CONFIG_POSTPROC */ -#endif /* HAVE_SSE2 */ diff --git a/libvpx/vp8/decoder/dboolhuff.c b/libvpx/vp8/decoder/dboolhuff.c index 546fb2d..0007d7a 100644 --- a/libvpx/vp8/decoder/dboolhuff.c +++ b/libvpx/vp8/decoder/dboolhuff.c @@ -47,8 +47,8 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br) unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1]; if (br->decrypt_cb) { - int n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left; - br->decrypt_cb(br->decrypt_state, bufptr, decrypted, n); + size_t n = bytes_left > sizeof(decrypted) ? 
sizeof(decrypted) : bytes_left; + br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n); bufptr = decrypted; } diff --git a/libvpx/vp8/decoder/decodframe.c b/libvpx/vp8/decoder/decodframe.c index 44c35ef..51eeb02 100644 --- a/libvpx/vp8/decoder/decodframe.c +++ b/libvpx/vp8/decoder/decodframe.c @@ -1095,7 +1095,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, "Failed to allocate bool decoder 0"); if (pc->frame_type == KEY_FRAME) { - pc->clr_type = (YUV_TYPE)vp8_read_bit(bc); + (void)vp8_read_bit(bc); // colorspace pc->clamp_type = (CLAMP_TYPE)vp8_read_bit(bc); } diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c index 2db3096..2d9e343 100644 --- a/libvpx/vp8/decoder/onyxd_if.c +++ b/libvpx/vp8/decoder/onyxd_if.c @@ -430,7 +430,6 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; - sd->clrtype = pbi->common.clr_type; #if CONFIG_POSTPROC ret = vp8_post_proc_frame(&pbi->common, sd, flags); #else diff --git a/libvpx/vp8/decoder/vp8_asm_dec_offsets.c b/libvpx/vp8/decoder/vp8_asm_dec_offsets.c deleted file mode 100644 index 842a0d5..0000000 --- a/libvpx/vp8/decoder/vp8_asm_dec_offsets.c +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/asm_offsets.h" -#include "onyxd_int.h" - -BEGIN - -DEFINE(bool_decoder_user_buffer_end, offsetof(BOOL_DECODER, user_buffer_end)); -DEFINE(bool_decoder_user_buffer, offsetof(BOOL_DECODER, user_buffer)); -DEFINE(bool_decoder_value, offsetof(BOOL_DECODER, value)); -DEFINE(bool_decoder_count, offsetof(BOOL_DECODER, count)); -DEFINE(bool_decoder_range, offsetof(BOOL_DECODER, range)); - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c index 4707ae5..5f0c1f7 100644 --- a/libvpx/vp8/encoder/bitstream.c +++ b/libvpx/vp8/encoder/bitstream.c @@ -1322,7 +1322,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest vp8_start_encode(bc, cx_data, cx_data_end); /* signal clr type */ - vp8_write_bit(bc, pc->clr_type); + vp8_write_bit(bc, 0); vp8_write_bit(bc, pc->clamp_type); } diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c index 433726d..ded0c43 100644 --- a/libvpx/vp8/encoder/firstpass.c +++ b/libvpx/vp8/encoder/firstpass.c @@ -1325,7 +1325,7 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta return Q; } -extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate); +extern void vp8_new_framerate(VP8_COMP *cpi, double framerate); void vp8_init_second_pass(VP8_COMP *cpi) { @@ -1349,9 +1349,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_frame_rate = cpi->frame_rate; + cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ; cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0); @@ -2398,7 +2398,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) target_frame_size += cpi->min_frame_bandwidth; /* Every other frame gets a few extra bits */ - if ( (cpi->common.frames_since_golden & 0x01) && + if ( (cpi->frames_since_golden & 0x01) && (cpi->frames_till_gf_update_due > 0) ) { target_frame_size += cpi->twopass.alt_extra_bits; @@ -2529,7 +2529,7 @@ void vp8_second_pass(VP8_COMP *cpi) /* Set nominal per second bandwidth for this frame */ cpi->target_bandwidth = (int) - (cpi->per_frame_bandwidth * cpi->output_frame_rate); + (cpi->per_frame_bandwidth * cpi->output_framerate); if (cpi->target_bandwidth < 0) cpi->target_bandwidth = 0; @@ -3185,7 +3185,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) /* Convert to a per second bitrate */ cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } /* Note the total error score of the kf group minus the key frame itself */ @@ -3224,7 +3224,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) cpi->common.vert_scale = NORMAL; /* Calculate Average bits per frame. */ - av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate); + av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate); /* CBR... 
Use the clip average as the target for deciding resample */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) @@ -3299,7 +3299,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) } else { - int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate)); + int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->framerate)); int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level; /* If triggered last time the threshold for triggering again is diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c index 73f6583..7c07975 100644 --- a/libvpx/vp8/encoder/onyx_if.c +++ b/libvpx/vp8/encoder/onyx_if.c @@ -301,11 +301,11 @@ static int rescale(int val, int num, int denom) static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, const int layer, - double prev_layer_frame_rate) + double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; - lc->frame_rate = cpi->output_frame_rate / cpi->oxcf.rate_decimator[layer]; + lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer]; lc->target_bandwidth = cpi->oxcf.target_bitrate[layer] * 1000; lc->starting_buffer_level_in_ms = oxcf->starting_buffer_level; @@ -335,7 +335,7 @@ static void init_temporal_layer_context(VP8_COMP *cpi, lc->avg_frame_size_for_layer = (int)((cpi->oxcf.target_bitrate[layer] - cpi->oxcf.target_bitrate[layer-1]) * 1000 / - (lc->frame_rate - prev_layer_frame_rate)); + (lc->framerate - prev_layer_framerate)); lc->active_worst_quality = cpi->oxcf.worst_allowed_q; lc->active_best_quality = cpi->oxcf.best_allowed_q; @@ -363,7 +363,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, const int prev_num_layers) { int i; - double prev_layer_frame_rate = 0; + double prev_layer_framerate = 0; const int curr_num_layers = cpi->oxcf.number_of_layers; // If the previous state was 1 layer, get current layer context from cpi. // We need this to set the layer context for the new layers below. @@ -377,7 +377,7 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, LAYER_CONTEXT *lc = &cpi->layer_context[i]; if (i >= prev_num_layers) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate); + init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); } // The initial buffer levels are set based on their starting levels. 
// We could set the buffer levels based on the previous state (normalized @@ -403,8 +403,8 @@ static void reset_temporal_layer_change(VP8_COMP *cpi, lc->bits_off_target = lc->buffer_level; restore_layer_context(cpi, 0); } - prev_layer_frame_rate = cpi->output_frame_rate / - cpi->oxcf.rate_decimator[i]; + prev_layer_framerate = cpi->output_framerate / + cpi->oxcf.rate_decimator[i]; } } @@ -1282,21 +1282,21 @@ int vp8_reverse_trans(int x) return 63; } -void vp8_new_frame_rate(VP8_COMP *cpi, double framerate) +void vp8_new_framerate(VP8_COMP *cpi, double framerate) { if(framerate < .1) framerate = 30; - cpi->frame_rate = framerate; - cpi->output_frame_rate = framerate; + cpi->framerate = framerate; + cpi->output_framerate = framerate; cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / - cpi->output_frame_rate); + cpi->output_framerate); cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); /* Set Maximum gf/arf interval */ - cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2); + cpi->max_gf_interval = ((int)(cpi->output_framerate / 2.0) + 2); if(cpi->max_gf_interval < 12) cpi->max_gf_interval = 12; @@ -1337,13 +1337,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) * seems like a reasonable framerate, then use that as a guess, otherwise * use 30. */ - cpi->frame_rate = (double)(oxcf->timebase.den) / - (double)(oxcf->timebase.num); + cpi->framerate = (double)(oxcf->timebase.den) / + (double)(oxcf->timebase.num); - if (cpi->frame_rate > 180) - cpi->frame_rate = 30; + if (cpi->framerate > 180) + cpi->framerate = 30; - cpi->ref_frame_rate = cpi->frame_rate; + cpi->ref_framerate = cpi->framerate; /* change includes all joint functionality */ vp8_change_config(cpi, oxcf); @@ -1369,13 +1369,13 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) if (cpi->oxcf.number_of_layers > 1) { unsigned int i; - double prev_layer_frame_rate=0; + double prev_layer_framerate=0; for (i=0; i<cpi->oxcf.number_of_layers; i++) { - init_temporal_layer_context(cpi, oxcf, i, prev_layer_frame_rate); - prev_layer_frame_rate = cpi->output_frame_rate / - cpi->oxcf.rate_decimator[i]; + init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate); + prev_layer_framerate = cpi->output_framerate / + cpi->oxcf.rate_decimator[i]; } } @@ -1399,14 +1399,14 @@ static void update_layer_contexts (VP8_COMP *cpi) if (oxcf->number_of_layers > 1) { unsigned int i; - double prev_layer_frame_rate=0; + double prev_layer_framerate=0; for (i=0; i<oxcf->number_of_layers; i++) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->frame_rate = - cpi->ref_frame_rate / oxcf->rate_decimator[i]; + lc->framerate = + cpi->ref_framerate / oxcf->rate_decimator[i]; lc->target_bandwidth = oxcf->target_bitrate[i] * 1000; lc->starting_buffer_level = rescale( @@ -1432,9 +1432,9 @@ static void update_layer_contexts (VP8_COMP *cpi) lc->avg_frame_size_for_layer = (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i-1]) * 1000 / - (lc->frame_rate - prev_layer_frame_rate)); + (lc->framerate - prev_layer_framerate)); - prev_layer_frame_rate = lc->frame_rate; + prev_layer_framerate = lc->framerate; } } } @@ -1625,7 +1625,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) cpi->oxcf.target_bandwidth, 1000); /* Set up frame rate and related parameters rate control values. 
*/ - vp8_new_frame_rate(cpi, cpi->frame_rate); + vp8_new_framerate(cpi, cpi->framerate); /* Set absolute upper and lower quality limits */ cpi->worst_quality = cpi->oxcf.worst_allowed_q; @@ -1945,7 +1945,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) for (i = 0; i < KEY_FRAME_CONTEXT; i++) { - cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate; + cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate; } #ifdef OUTPUT_YUV_SRC @@ -2273,7 +2273,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { extern int count_mb_seg[4]; FILE *f = fopen("modes.stt", "a"); - double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ; + double dr = (double)cpi->framerate * (double)bytes * (double)8 / (double)count / (double)1000 ; fprintf(f, "intra_mode in Intra Frames:\n"); fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]); fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]); @@ -2750,7 +2750,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi) cpi->gf_active_count = cm->mb_rows * cm->mb_cols; /* this frame refreshes means next frames don't unless specified by user */ - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; /* Clear the alternate reference update pending flag. */ cpi->source_alt_ref_pending = 0; @@ -2802,7 +2802,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi) * user */ cm->refresh_golden_frame = 0; - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; cpi->recent_ref_frame_usage[LAST_FRAME] = 1; @@ -2834,12 +2834,12 @@ static void update_golden_frame_stats(VP8_COMP *cpi) if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; - if (cpi->common.frames_till_alt_ref_frame) - cpi->common.frames_till_alt_ref_frame --; + if (cpi->frames_till_alt_ref_frame) + cpi->frames_till_alt_ref_frame --; - cpi->common.frames_since_golden ++; + cpi->frames_since_golden ++; - if (cpi->common.frames_since_golden > 1) + if (cpi->frames_since_golden > 1) { cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->mb.count_mb_ref_frame_usage[INTRA_FRAME]; @@ -2890,11 +2890,11 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) cpi->prob_last_coded = 200; cpi->prob_gf_coded = 1; } - else if (cpi->common.frames_since_golden == 0) + else if (cpi->frames_since_golden == 0) { cpi->prob_last_coded = 214; } - else if (cpi->common.frames_since_golden == 1) + else if (cpi->frames_since_golden == 1) { cpi->prob_last_coded = 192; cpi->prob_gf_coded = 220; @@ -3368,12 +3368,12 @@ static void encode_frame_to_data_rate cpi->per_frame_bandwidth = cpi->twopass.gf_bits; /* per second target bitrate */ cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } } else #endif - cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_frame_rate); + cpi->per_frame_bandwidth = (int)(cpi->target_bandwidth / cpi->output_framerate); /* Default turn off buffer to buffer copying */ cm->copy_buffer_to_gf = 0; @@ -4557,7 +4557,7 @@ static void encode_frame_to_data_rate { LAYER_CONTEXT *lc = &cpi->layer_context[i]; int bits_off_for_this_layer = - (int)(lc->target_bandwidth / lc->frame_rate - + (int)(lc->target_bandwidth / lc->framerate - cpi->projected_frame_size); lc->bits_off_target += bits_off_for_this_layer; @@ -4805,7 +4805,7 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, { double 
two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *cpi->oxcf.two_pass_vbrmin_section / 100); - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->framerate); } } #endif @@ -4821,8 +4821,10 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C { #if HAVE_NEON int64_t store_reg[8]; -#endif +#if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON *cm = &cpi->common; +#endif +#endif struct vpx_usec_timer timer; int res = 0; @@ -4848,7 +4850,6 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) res = -1; - cm->clr_type = sd->clrtype; vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); @@ -4933,7 +4934,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l cpi->frames_till_gf_update_due); force_src_buffer = &cpi->alt_ref_buffer; } - cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; + cpi->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; cm->refresh_alt_ref_frame = 1; cm->refresh_golden_frame = 0; cm->refresh_last_frame = 0; @@ -5038,7 +5039,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l if (this_duration) { if (step) - cpi->ref_frame_rate = 10000000.0 / this_duration; + cpi->ref_framerate = 10000000.0 / this_duration; else { double avg_duration, interval; @@ -5052,11 +5053,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l if(interval > 10000000.0) interval = 10000000; - avg_duration = 10000000.0 / cpi->ref_frame_rate; + avg_duration = 10000000.0 / cpi->ref_framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; - cpi->ref_frame_rate = 10000000.0 / avg_duration; + cpi->ref_framerate = 10000000.0 / avg_duration; } if (cpi->oxcf.number_of_layers > 1) @@ -5067,12 +5068,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l for (i=0; i<cpi->oxcf.number_of_layers; i++) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->frame_rate = cpi->ref_frame_rate / - cpi->oxcf.rate_decimator[i]; + lc->framerate = cpi->ref_framerate / + cpi->oxcf.rate_decimator[i]; } } else - vp8_new_frame_rate(cpi, cpi->ref_frame_rate); + vp8_new_framerate(cpi, cpi->ref_framerate); } cpi->last_time_stamp_seen = cpi->source->ts_start; @@ -5089,7 +5090,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l layer = cpi->oxcf.layer_id[ cpi->temporal_pattern_counter % cpi->oxcf.periodicity]; restore_layer_context (cpi, layer); - vp8_new_frame_rate (cpi, cpi->layer_context[layer].frame_rate); + vp8_new_framerate(cpi, cpi->layer_context[layer].framerate); } if (cpi->compressor_speed == 2) diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h index 5120fcc..3ab0fe8 100644 --- a/libvpx/vp8/encoder/onyx_int.h +++ b/libvpx/vp8/encoder/onyx_int.h @@ -232,7 +232,7 @@ enum typedef struct { /* Layer configuration */ - double frame_rate; + double framerate; int target_bandwidth; /* Layer specific coding parameters */ @@ -320,6 +320,7 @@ typedef struct VP8_COMP YV12_BUFFER_CONFIG scaled_source; YV12_BUFFER_CONFIG *last_frame_unscaled_source; + unsigned int frames_till_alt_ref_frame; /* frame in src_buffers has been identified to be encoded as an alt ref */ int source_alt_ref_pending; /* an alt ref 
frame has been encoded and is usable */ @@ -369,6 +370,7 @@ typedef struct VP8_COMP double key_frame_rate_correction_factor; double gf_rate_correction_factor; + unsigned int frames_since_golden; /* Count down till next GF */ int frames_till_gf_update_due; @@ -401,7 +403,7 @@ typedef struct VP8_COMP /* Minimum allocation that should be used for any frame */ int min_frame_bandwidth; int inter_frame_target; - double output_frame_rate; + double output_framerate; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; @@ -415,8 +417,8 @@ typedef struct VP8_COMP int buffered_mode; - double frame_rate; - double ref_frame_rate; + double framerate; + double ref_framerate; int64_t buffer_level; int64_t bits_off_target; diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c index 8e3c01d..1e8259c 100644 --- a/libvpx/vp8/encoder/ratectrl.c +++ b/libvpx/vp8/encoder/ratectrl.c @@ -234,7 +234,7 @@ void vp8_save_coding_context(VP8_COMP *cpi) cc->frames_since_key = cpi->frames_since_key; cc->filter_level = cpi->common.filter_level; cc->frames_till_gf_update_due = cpi->frames_till_gf_update_due; - cc->frames_since_golden = cpi->common.frames_since_golden; + cc->frames_since_golden = cpi->frames_since_golden; vp8_copy(cc->mvc, cpi->common.fc.mvc); vp8_copy(cc->mvcosts, cpi->rd_costs.mvcosts); @@ -271,7 +271,7 @@ void vp8_restore_coding_context(VP8_COMP *cpi) cpi->frames_since_key = cc->frames_since_key; cpi->common.filter_level = cc->filter_level; cpi->frames_till_gf_update_due = cc->frames_till_gf_update_due; - cpi->common.frames_since_golden = cc->frames_since_golden; + cpi->frames_since_golden = cc->frames_since_golden; vp8_copy(cpi->common.fc.mvc, cc->mvc); @@ -388,7 +388,7 @@ static void calc_iframe_target_size(VP8_COMP *cpi) int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */ /* Boost depends somewhat on frame rate: only used for 1 layer case. */ if (cpi->oxcf.number_of_layers == 1) { - kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16)); + kf_boost = MAX(initial_boost, (int)(2 * cpi->output_framerate - 16)); } else { /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */ @@ -399,9 +399,9 @@ static void calc_iframe_target_size(VP8_COMP *cpi) kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100; /* frame separation adjustment ( down) */ - if (cpi->frames_since_key < cpi->output_frame_rate / 2) + if (cpi->frames_since_key < cpi->output_framerate / 2) kf_boost = (int)(kf_boost - * cpi->frames_since_key / (cpi->output_frame_rate / 2)); + * cpi->frames_since_key / (cpi->output_framerate / 2)); /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) @@ -715,7 +715,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) if (Adjustment > (cpi->this_frame_target - min_frame_target)) Adjustment = (cpi->this_frame_target - min_frame_target); - if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1)) + if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1)) cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment); else cpi->this_frame_target -= Adjustment; @@ -1360,7 +1360,7 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi) * whichever is smaller. */ int key_freq = cpi->oxcf.key_freq>0 ? 
cpi->oxcf.key_freq : 1; - av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2; + av_key_frame_frequency = 1 + (int)cpi->output_framerate * 2; if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) av_key_frame_frequency = key_freq; diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c index 8579614..521e84f 100644 --- a/libvpx/vp8/encoder/rdopt.c +++ b/libvpx/vp8/encoder/rdopt.c @@ -341,7 +341,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue) void vp8_auto_select_speed(VP8_COMP *cpi) { - int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate); + int milliseconds_for_compress = (int)(1000000 / cpi->framerate); milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16; diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk index cde2651..f98eb31 100644 --- a/libvpx/vp8/vp8_common.mk +++ b/libvpx/vp8/vp8_common.mk @@ -66,7 +66,6 @@ VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c VP8_COMMON_SRCS-yes += common/variance_c.c VP8_COMMON_SRCS-yes += common/variance.h -VP8_COMMON_SRCS-yes += common/vp8_asm_com_offsets.c VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h @@ -192,7 +191,4 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(A VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) -$(eval $(call asm_offsets_template,\ - vp8_asm_com_offsets.asm, $(VP8_PREFIX)common/vp8_asm_com_offsets.c)) - $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh)) diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index 4531d5a..9a7b9c5 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -695,7 +695,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->uv_stride = img->stride[VPX_PLANE_U]; yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; - yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); return res; } @@ -1079,11 +1078,7 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) ctx->preview_img.planes[VPX_PLANE_U] = sd.u_buffer; ctx->preview_img.planes[VPX_PLANE_V] = sd.v_buffer; - if (sd.clrtype == REG_YUV) - ctx->preview_img.fmt = VPX_IMG_FMT_I420; - else - ctx->preview_img.fmt = VPX_IMG_FMT_VPXI420; - + ctx->preview_img.fmt = VPX_IMG_FMT_I420; ctx->preview_img.x_chroma_shift = 1; ctx->preview_img.y_chroma_shift = 1; diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c index c826f69..871b8d3 100644 --- a/libvpx/vp8/vp8_dx_iface.c +++ b/libvpx/vp8/vp8_dx_iface.c @@ -41,15 +41,6 @@ typedef enum static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t); -typedef struct -{ - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - static const mem_req_t vp8_mem_req_segs[] = { {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz}, @@ -93,65 +84,6 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_ return sizeof(vpx_codec_alg_priv_t); } - -static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) -{ - free(mmap->priv); -} - -static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) -{ - vpx_codec_err_t res; - unsigned int align; - - align = mmap->align ? 
mmap->align - 1 : 0; - - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); - - res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vp8_mmap_dtor; - return res; -} - -static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - vpx_codec_flags_t init_flags) -{ - int i; - vpx_codec_err_t res = VPX_CODEC_OK; - - for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) - { - /* Ensure the segment has been allocated */ - if (!mmaps[i].base) - { - res = VPX_CODEC_MEM_ERROR; - break; - } - - /* Verify variable size segment is big enough for the current si. */ - if (vp8_mem_req_segs[i].calc_sz) - { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) - { - res = VPX_CODEC_MEM_ERROR; - break; - } - } - } - - return res; -} - static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { int i; @@ -178,16 +110,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) } } -static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) -{ - int i; - - for (i = 0; i < NELEMENTS(ctx->mmaps); i++) - if (ctx->mmaps[i].id == id) - return ctx->mmaps[i].base; - - return NULL; -} static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { /* nothing to clean up */ @@ -214,7 +136,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, mmap.align = vp8_mem_req_segs[0].align; mmap.flags = vp8_mem_req_segs[0].flags; - res = vp8_mmap_alloc(&mmap); + res = vpx_mmap_alloc(&mmap); if (res != VPX_CODEC_OK) return res; vp8_init_ctx(ctx, &mmap); @@ -366,8 +288,7 @@ static void yuvconfig2image(vpx_image_t *img, * the Y, U, and V planes, nor other alignment adjustments that * might be representable by a YV12_BUFFER_CONFIG, so we just * initialize all the fields.*/ - img->fmt = yv12->clrtype == REG_YUV ? 
- VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420; + img->fmt = VPX_IMG_FMT_I420; img->w = yv12->y_stride; img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; img->d_w = yv12->y_width; @@ -488,7 +409,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg, ctx->base.init_flags); - res = vp8_mmap_alloc(&ctx->mmaps[i]); + res = vpx_mmap_alloc(&ctx->mmaps[i]); } if (!res) @@ -500,7 +421,9 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { - res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags); + res = vpx_validate_mmaps(&ctx->si, ctx->mmaps, + vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs), + ctx->base.init_flags); if (!res) { @@ -797,8 +720,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->uv_stride = img->stride[VPX_PLANE_U]; yv12->border = (img->stride[VPX_PLANE_Y] - img->d_w) / 2; - yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); - return res; } diff --git a/libvpx/vp8/vp8dx.mk b/libvpx/vp8/vp8dx.mk index c26f42d..4a8f467 100644 --- a/libvpx/vp8/vp8dx.mk +++ b/libvpx/vp8/vp8dx.mk @@ -35,9 +35,5 @@ VP8_DX_SRCS-yes += decoder/onyxd_int.h VP8_DX_SRCS-yes += decoder/treereader.h VP8_DX_SRCS-yes += decoder/onyxd_if.c VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c -VP8_DX_SRCS-yes += decoder/vp8_asm_dec_offsets.c VP8_DX_SRCS-yes := $(filter-out $(VP8_DX_SRCS_REMOVE-yes),$(VP8_DX_SRCS-yes)) - -$(eval $(call asm_offsets_template,\ - vp8_asm_dec_offsets.asm, $(VP8_PREFIX)decoder/vp8_asm_dec_offsets.c)) diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm new file mode 100644 index 0000000..15039e2 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -0,0 +1,277 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; VP9_FILTER_WEIGHT == 128 + ; VP9_FILTER_SHIFT == 7 + + EXPORT |vp9_convolve8_avg_horiz_neon| + EXPORT |vp9_convolve8_avg_vert_neon| + IMPORT |vp9_convolve8_avg_horiz_c| + IMPORT |vp9_convolve8_avg_vert_c| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|vp9_convolve8_avg_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r4, [sp, #36] ; x_step_q4 + ldr r5, [sp, #32] ; filter_x + cmp r4, #16 + bne call_horiz_c_convolve ; x_step_q4 != 16 + + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + add r8, r1, r1, lsl #1 ; src_stride * 3 + add r8, r8, #4 ; src_stride * 3 + 4 + rsb r8, r8, #0 ; reset for src + + add r4, r3, r3, lsl #1 ; dst_stride * 3 + sub r4, r4, #4 ; dst_stride * 3 - 4 + rsb r4, r4, #0 ; reset for dst + + sub r9, r1, #8 ; post increment for src load + + rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +loop_horiz + vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! + vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 + + vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! + vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 + + vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! + vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 + + vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! + vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! 
+ vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + + ; extract to s16 + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + vtrn.32 d28, d29 ; only the first half is populated + vmovl.u8 q12, d28 + vmovl.u8 q13, d30 + + ; slightly out of order load to match the existing data + vld1.u32 {d6[0]}, [r2], r3 + vld1.u32 {d7[0]}, [r2], r3 + vld1.u32 {d6[1]}, [r2], r3 + vld1.u32 {d7[1]}, [r2], r3 + + sub r2, r2, r3, lsl #2 ; reset for store + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 + MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 + MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + ; average the new value and the dst value + vaddl.u8 q8, d2, d6 + vaddl.u8 q9, d3, d7 + vqrshrn.u16 d2, q8, #1 + vqrshrn.u16 d3, q9, #1 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r4 + + subs r6, r6, #4 ; w -= 4 + bgt loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r1 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt loop_horiz + + pop {r4-r10, pc} + +call_horiz_c_convolve + pop {r4-r10, lr} + add r0, r0, #3 ; un-adjust for taps + b vp9_convolve8_avg_horiz_c + + + ENDP + +|vp9_convolve8_avg_vert_neon| PROC + push {r4-r10, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r6, [sp, #44] ; y_step_q4 + ldr r7, [sp, #40] ; filter_y + cmp r6, #16 + bne call_vert_c_convolve ; y_step_q4 != 16 + + ldr r8, [sp, #48] ; w + ldr r9, [sp, #52] ; h + + vld1.s16 {q0}, [r7] ; filter_y + + mov r5, r1, lsl #1 ; src_stride * 2 + add r5, r5, r1, lsl #3 ; src_stride * 10 + sub r5, r5, #4 ; src_stride * 10 + 4 + rsb r5, r5, #0 ; reset for src + + add r6, r3, r3, lsl #1 ; dst_stride * 3 + sub r6, r6, #4 ; dst_stride * 3 - 4 + rsb r6, r6, #0 ; reset for dst + + rsb r7, r8, r1, lsl #2 ; reset src for outer loop + rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + + mov r10, r8 ; w loop counter + +loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d16[0]}, [r0], r1 + vld1.u32 {d16[1]}, [r0], r1 + vld1.u32 {d18[0]}, [r0], r1 + vld1.u32 {d18[1]}, [r0], r1 + vld1.u32 {d20[0]}, [r0], r1 + vld1.u32 {d20[1]}, [r0], r1 + vld1.u32 {d22[0]}, [r0], r1 + vld1.u32 {d22[1]}, [r0], r1 + vld1.u32 {d24[0]}, [r0], r1 + vld1.u32 {d24[1]}, [r0], r1 + vld1.u32 {d26[0]}, [r0], r5 + + ; extract to s16 + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + vld1.u32 {d6[0]}, [r2], r3 + vld1.u32 {d6[1]}, [r2], r3 + vld1.u32 {d7[0]}, [r2], r3 + vld1.u32 {d7[1]}, [r2], r3 + + sub r2, r2, r3, lsl #2 ; reset for store + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + ; average the new value and the dst value + vaddl.u8 q8, d2, 
d6 + vaddl.u8 q9, d3, d7 + vqrshrn.u16 d2, q8, #1 + vqrshrn.u16 d3, q9, #1 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r6 + + subs r8, r8, #4 ; w -= 4 + bgt loop_vert + + ; outer loop + mov r8, r10 ; restore w counter + add r0, r0, r7 ; src += 4 * src_stride - w + add r2, r2, r12 ; dst += 4 * dst_stride - w + subs r9, r9, #4 ; h -= 4 + bgt loop_vert + + pop {r4-r10, pc} + +call_vert_c_convolve + pop {r4-r10, lr} + ; un-adjust for taps + add r0, r0, r1 + add r0, r0, r1, lsl #1 + b vp9_convolve8_avg_vert_c + + ENDP + END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm new file mode 100644 index 0000000..842c73c --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -0,0 +1,250 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; VP9_FILTER_WEIGHT == 128 + ; VP9_FILTER_SHIFT == 7 + + EXPORT |vp9_convolve8_horiz_neon| + EXPORT |vp9_convolve8_vert_neon| + IMPORT |vp9_convolve8_horiz_c| + IMPORT |vp9_convolve8_vert_c| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|vp9_convolve8_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r4, [sp, #36] ; x_step_q4 + ldr r5, [sp, #32] ; filter_x + cmp r4, #16 + bne call_horiz_c_convolve ; x_step_q4 != 16 + + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + add r8, r1, r1, lsl #1 ; src_stride * 3 + add r8, r8, #4 ; src_stride * 3 + 4 + rsb r8, r8, #0 ; reset for src + + add r4, r3, r3, lsl #1 ; dst_stride * 3 + sub r4, r4, #4 ; dst_stride * 3 - 4 + rsb r4, r4, #0 ; reset for dst + + sub r9, r1, #8 ; post increment for src load + + rsb r1, r6, r1, lsl #2 ; reset src for outer loop + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +loop_horiz + vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! + vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 + + vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! + vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 + + vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! + vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 + + vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! + vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! 
+ vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + + ; extract to s16 + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + vtrn.32 d28, d29 ; only the first half is populated + vmovl.u8 q12, d28 + vmovl.u8 q13, d30 + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d18, d20, d22, d17, d19, d21, d23 + MULTIPLY_BY_Q0 q2, d18, d20, d22, d17, d19, d21, d23, d24 + MULTIPLY_BY_Q0 q14, d20, d22, d17, d19, d21, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d22, d17, d19, d21, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r4 + + subs r6, r6, #4 ; w -= 4 + bgt loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r1 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt loop_horiz + + pop {r4-r10, pc} + +call_horiz_c_convolve + pop {r4-r10, lr} + add r0, r0, #3 ; un-adjust for taps + b vp9_convolve8_horiz_c + + + ENDP + +|vp9_convolve8_vert_neon| PROC + push {r4-r10, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r6, [sp, #44] ; y_step_q4 + ldr r7, [sp, #40] ; filter_y + cmp r6, #16 + bne call_vert_c_convolve ; y_step_q4 != 16 + + ldr r8, [sp, #48] ; w + ldr r9, [sp, #52] ; h + + vld1.s16 {q0}, [r7] ; filter_y + + mov r5, r1, lsl #1 ; src_stride * 2 + add r5, r5, r1, lsl #3 ; src_stride * 10 + sub r5, r5, #4 ; src_stride * 10 + 4 + rsb r5, r5, #0 ; reset for src + + add r6, r3, r3, lsl #1 ; dst_stride * 3 + sub r6, r6, #4 ; dst_stride * 3 - 4 + rsb r6, r6, #0 ; reset for dst + + rsb r7, r8, r1, lsl #2 ; reset src for outer loop + rsb r12, r8, r3, lsl #2 ; reset dst for outer loop + + mov r10, r8 ; w loop counter + +loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d16[0]}, [r0], r1 + vld1.u32 {d16[1]}, [r0], r1 + vld1.u32 {d18[0]}, [r0], r1 + vld1.u32 {d18[1]}, [r0], r1 + vld1.u32 {d20[0]}, [r0], r1 + vld1.u32 {d20[1]}, [r0], r1 + vld1.u32 {d22[0]}, [r0], r1 + vld1.u32 {d22[1]}, [r0], r1 + vld1.u32 {d24[0]}, [r0], r1 + vld1.u32 {d24[1]}, [r0], r1 + vld1.u32 {d26[0]}, [r0], r5 + + ; extract to s16 + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d23 + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d23, d24 + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d23, d24, d25 + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d23, d24, d25, d26 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqshrn.u16 d2, q1, #0 + vqshrn.u16 d3, q2, #0 + + vst1.u32 {d2[0]}, [r2], r3 + vst1.u32 {d2[1]}, [r2], r3 + vst1.u32 {d3[0]}, [r2], r3 + vst1.u32 {d3[1]}, [r2], r6 + + subs r8, r8, #4 ; w -= 4 + bgt loop_vert + + ; outer loop + mov r8, r10 ; restore w counter + add r0, r0, r7 ; src += 4 * src_stride - w + add r2, r2, r12 ; dst += 4 * dst_stride - w + subs r9, r9, #4 ; h -= 4 + bgt loop_vert + + pop {r4-r10, pc} + +call_vert_c_convolve + pop {r4-r10, lr} + ; un-adjust for taps + add r0, r0, r1 + add r0, r0, r1, lsl #1 + b vp9_convolve8_vert_c + + ENDP + END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c 
b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c new file mode 100644 index 0000000..6e37ff6 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). + */ + uint8_t temp[64 * 72]; + + // Account for the vertical phase needing 3 lines prior and 4 lines post + int intermediate_height = h + 7; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + /* Filter starting 3 lines back. The neon implementation will ignore the + * given height and filter a multiple of 4 lines. Since this goes in to + * the temp buffer which has lots of extra room and is subsequently discarded + * this is safe if somewhat less than ideal. + */ + vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vp9_convolve8_vert_neon(temp + 64 * 3, 64, + dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} + +void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + uint8_t temp[64 * 72]; + int intermediate_height = h + 7; + + if (x_step_q4 != 16 || y_step_q4 != 16) + return vp9_convolve8_avg_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height); + vp9_convolve8_avg_vert_neon(temp + 64 * 3, + 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} diff --git a/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm new file mode 100644 index 0000000..60a0d98 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
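For reference, a minimal sketch of the temp-buffer sizing that the vp9_convolve_neon.c wrapper above relies on, assuming its stated constraints (w <= 64, h <= 64, 8 taps); the helper below is illustrative and not part of the library:

#include <assert.h>

#define TEMP_STRIDE 64   /* intermediate rows are stored with a stride of 64 */
#define TEMP_ROWS   72   /* 64 + 7 needed rows, rounded up to a multiple of 4 */

/* An 8-tap vertical pass over h output rows reads 3 rows above and 4 rows
 * below them, so the horizontal pass must emit h + 7 intermediate rows. */
static int intermediate_height(int h) {
  return h + 7;
}

int main(void) {
  /* worst case the wrapper permits: w = 64, h = 64 -> 71 rows <= 72 */
  assert(intermediate_height(64) <= TEMP_ROWS);
  return 0;
}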
+; + + + EXPORT |vp9_dc_only_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr, +; uint8_t *dst_ptr, int pitch, int stride) +; +; r0 int input_dc +; r1 uint8_t *pred_ptr +; r2 uint8_t *dst_ptr +; r3 int pitch +; sp int stride + +|vp9_dc_only_idct_add_neon| PROC + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; dct_const_round_shift(input_dc * cospi_16_64) + mul r0, r0, r12 ; input_dc * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.16 q0, r0; ; duplicate a1 + ldr r12, [sp] ; load stride + + vld1.32 {d2[0]}, [r1], r3 + vld1.32 {d2[1]}, [r1], r3 + vld1.32 {d4[0]}, [r1], r3 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c] + vaddw.u8 q2, q0, d4 + + vqmovun.s16 d2, q1 ; clip_pixel + vqmovun.s16 d4, q2 + + vst1.32 {d2[0]}, [r2], r12 + vst1.32 {d2[1]}, [r2], r12 + vst1.32 {d4[0]}, [r2], r12 + vst1.32 {d4[1]}, [r2] + + bx lr + ENDP ; |vp9_dc_only_idct_add_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm new file mode 100644 index 0000000..8b4fe5d --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -0,0 +1,708 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_loop_filter_horizontal_edge_neon| + EXPORT |vp9_loop_filter_vertical_edge_neon| + EXPORT |vp9_mbloop_filter_horizontal_edge_neon| + EXPORT |vp9_mbloop_filter_vertical_edge_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. 
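The vp9_dc_only_idct_add_neon routine above mirrors the scalar DC-only path comment for comment. A plain-C sketch of that arithmetic, using the constants visible in the code (cospi_16_64 = 11585, DCT_CONST_BITS = 14); the function and clip_pixel names here are illustrative, not the library's:

#include <stdint.h>

#define COSPI_16_64    11585
#define DCT_CONST_BITS 14

/* Saturate to [0, 255]; stands in for the codec's pixel clamp. */
static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* The DC coefficient goes through dct_const_round_shift() twice (once for the
 * row pass, once for the column pass), is rounded by ROUND_POWER_OF_TWO(out, 4)
 * and then added to every pixel of the 4x4 prediction block. */
static void dc_only_idct_add_sketch(int input_dc, const uint8_t *pred,
                                    uint8_t *dst, int pitch, int stride) {
  int out = (input_dc * COSPI_16_64 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  out = (out * COSPI_16_64 + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  {
    const int a1 = (out + 8) >> 4;  /* ROUND_POWER_OF_TWO(out, 4) */
    int r, c;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        dst[r * stride + c] = clip_pixel(pred[r * pitch + c] + a1);
  }
}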
+; +; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_loop_filter_horizontal_edge_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + cmp r12, #0 + beq end_vp9_lf_h_edge + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + +count_lf_h_loop + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl vp9_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + add r0, r0, #8 + subs r12, r12, #1 + bne count_lf_h_loop + +end_vp9_lf_h_edge + pop {pc} + ENDP ; |vp9_loop_filter_horizontal_edge_neon| + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. +; +; void vp9_loop_filter_vertical_edge_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_loop_filter_vertical_edge_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + cmp r12, #0 + beq end_vp9_lf_v_edge + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + +count_lf_v_loop + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl vp9_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + add r0, r0, r1, lsl #3 ; s += pitch * 8 + subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns + bne count_lf_v_loop + +end_vp9_lf_v_edge + pop {pc} + ENDP ; |vp9_loop_filter_vertical_edge_neon| + +; void vp9_loop_filter_neon(); +; This is a helper function for the loopfilters. 
The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|vp9_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon| + +; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_mbloop_filter_horizontal_edge_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #16] ; load count + ldr r2, [sp, #12] ; load thresh + add r1, r1, r1 ; double pitch + + cmp r12, #0 + beq end_vp9_mblf_h_edge + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + +count_mblf_h_loop + sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r2, r3, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r3@64], r1 ; p3 + vld1.u8 {d4}, [r2@64], r1 ; p2 + vld1.u8 {d5}, [r3@64], r1 ; p1 + vld1.u8 {d6}, [r2@64], r1 ; p0 + vld1.u8 {d7}, 
[r3@64], r1 ; q0 + vld1.u8 {d16}, [r2@64], r1 ; q1 + vld1.u8 {d17}, [r3@64] ; q2 + vld1.u8 {d18}, [r2@64], r1 ; q3 + + sub r3, r3, r1, lsl #1 + sub r2, r2, r1, lsl #2 + + bl vp9_mbloop_filter_neon + + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 + + add r0, r0, #8 + subs r12, r12, #1 + bne count_mblf_h_loop + +end_vp9_mblf_h_edge + pop {r4-r5, pc} + + ENDP ; |vp9_mbloop_filter_horizontal_edge_neon| + +; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_mbloop_filter_vertical_edge_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #16] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + cmp r12, #0 + beq end_vp9_mblf_v_edge + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + +count_mblf_v_loop + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + sub r2, r0, #3 + add r3, r0, #1 + + bl vp9_mbloop_filter_neon + + ;store op2, op1, op0, oq0 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] + + ;store oq1, oq2 + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] + + add r0, r0, r1, lsl #3 ; s += pitch * 8 + subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns + bne count_mblf_v_loop + +end_vp9_mblf_v_edge + pop {r4-r5, pc} + ENDP ; |vp9_mbloop_filter_vertical_edge_neon| + +; void vp9_mbloop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. 
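Both loop-filter helpers apply the same filter() step their comments walk through: a clamped delta built from (p1 - q1) and 3 * (q0 - p0), inner-tap adjustments on p0/q0, and hev-gated outer-tap adjustments on p1/q1. A hedged scalar sketch of that step (names are illustrative; mask and hev are the 0/-1 byte masks computed beforehand):

#include <stdint.h>

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter4_sketch(int8_t mask, int8_t hev,
                           uint8_t *op1, uint8_t *op0,
                           uint8_t *oq0, uint8_t *oq1) {
  /* bias pixels to the signed range, as the veor with 0x80 does */
  const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
  int8_t filter, filter1, filter2;

  filter = (int8_t)(signed_char_clamp(ps1 - qs1) & hev);
  filter = (int8_t)(signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask);

  filter1 = (int8_t)(signed_char_clamp(filter + 4) >> 3);
  filter2 = (int8_t)(signed_char_clamp(filter + 3) >> 3);

  *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
  *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);

  /* outer taps use (filter1 + 1) >> 1, and only where hev is not set */
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);
  *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
  *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
}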
+; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 +|vp9_mbloop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d1, d19 + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) + + vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + + vqadd.u8 d24, d24, d23 ; a = b + a + + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + + vmov.u8 d23, #1 + vcge.u8 d24, d0, d24 ; a > blimit + + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + + vcge.u8 d20, d23, d20 ; flat + + vand d19, d19, d24 ; mask + + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + + vand d20, d20, d19 ; flat & mask + + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + + ; This instruction will truncate the "flat & mask" masks down to 4 bits + ; each to fit into one 32 bit arm register. The values are stored in + ; q10.64[0]. + vshrn.u16 d30, q10, #4 + vmov.u32 r4, d30[0] ; flat & mask 4bits + + adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. + beq power_branch_only + + cmp r4, #0 ; Check for 0, set flag for later + + ; mbfilter() function + ; filter() function + ; convert to signed + veor d21, d7, d22 ; qs0 + veor d24, d6, d22 ; ps0 + veor d25, d5, d22 ; ps1 + veor d26, d16, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d23 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d23 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. + beq filter_branch_only + + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. 
+ veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. + vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + + vqrshrn.u16 d30, q14, #3 ; r_op2 + + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + + vqrshrn.u16 d31, q14, #3 ; r_op1 + + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) + + vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + + vqrshrn.u16 d22, q14, #3 ; r_oq0 + + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 + + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 + + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + + vqrshrn.u16 d6, q14, #3 ; r_oq1 + + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 + + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) + + vqrshrn.u16 d7, q14, #3 ; r_oq2 + + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) + + bx lr + +power_branch_only + vmov.u8 d27, #3 + vmov.u8 d21, #2 + vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 + vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 + vaddw.u8 q14, d5 ; op2 += p1 + vqrshrn.u16 d0, q14, #3 ; op2 + + vsubw.u8 q14, d3 ; op1 = op2 - p3 + vsubw.u8 q14, d4 ; op1 -= p2 + vaddw.u8 q14, d5 ; op1 += p1 + vaddw.u8 q14, d16 ; op1 += q1 + vqrshrn.u16 d1, q14, #3 ; op1 + + vsubw.u8 q14, d3 ; op0 = op1 - p3 + vsubw.u8 q14, d5 ; op0 -= p1 + vaddw.u8 q14, d6 ; op0 += p0 + vaddw.u8 q14, d17 ; op0 += q2 + vqrshrn.u16 d2, q14, #3 ; op0 + + vsubw.u8 q14, d3 ; oq0 = op0 - p3 + vsubw.u8 q14, d6 ; oq0 -= p0 + vaddw.u8 q14, d7 ; oq0 += q0 + vaddw.u8 q14, d18 ; oq0 += q3 + vqrshrn.u16 d3, q14, #3 ; oq0 + + vsubw.u8 q14, d4 ; oq1 = oq0 - p2 + vsubw.u8 q14, d7 ; oq1 -= q0 + vaddw.u8 q14, d16 ; oq1 += q1 + vaddw.u8 q14, d18 ; oq1 += q3 + vqrshrn.u16 d4, q14, #3 ; oq1 + + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 + vsubw.u8 q14, d16 ; oq2 -= q1 + vaddw.u8 q14, d17 ; oq2 += q2 + vaddw.u8 q14, d18 ; oq2 += q3 + vqrshrn.u16 d5, q14, #3 ; oq2 + + bx lr + +filter_branch_only + ; TODO(fgalligan): See if we can rearange registers so we do not need to + ; do the 2 vswp. 
+ vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 + + bx lr + + ENDP ; |vp9_mbloop_filter_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm new file mode 100644 index 0000000..8e4aada --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -0,0 +1,356 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_short_idct8x8_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are + ; loaded in q8-q15. The output will be stored back into q8-q15 registers. + ; This macro will touch q0-q7 registers and use them as buffer during + ; calculation. + MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3; ; duplicate cospi_28_64 + vdup.16 d1, r4; ; duplicate cospi_4_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[7] * cospi_4_64 + vmull.s16 q4, d30, d1 + vmull.s16 q5, d31, d1 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vsub.s32 q6, q2, q4 + vsub.s32 q7, q3, q5 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q6, #14 ; >> 14 + vqrshrn.s32 d9, q7, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[7] * cospi_28_64 + vmull.s16 q1, d30, d0 + vmull.s16 q5, d31, d0 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vadd.s32 q2, q2, q1 + vadd.s32 q3, q3, q5 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + vdup.16 d0, r5; ; duplicate cospi_12_64 + vdup.16 d1, r6; ; duplicate cospi_20_64 + + ; input[5] * cospi_12_64 + vmull.s16 q2, d26, d0 + vmull.s16 q3, d27, d0 + + ; input[3] * cospi_20_64 + vmull.s16 q5, d22, d1 + vmull.s16 q6, d23, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vsub.s32 q2, q2, q5 + vsub.s32 q3, q3, q6 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; input[5] * cospi_20_64 + vmull.s16 q2, d26, d1 + vmull.s16 q3, d27, d1 + + ; input[3] * cospi_12_64 + vmull.s16 q9, d22, d0 + vmull.s16 q15, d23, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vadd.s32 q0, q2, q9 + vadd.s32 q1, q3, q15 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q0, #14 ; >> 14 + vqrshrn.s32 d13, q1, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7; ; duplicate cospi_16_64 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[2] * cospi_16_64 + vmull.s16 q9, d24, d0 + vmull.s16 q11, d25, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vadd.s32 q9, q2, q9 + vadd.s32 q11, q3, q11 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q9, #14 ; >> 14 + vqrshrn.s32 d19, q11, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[2] * cospi_16_64 + vmull.s16 q13, d24, d0 + vmull.s16 q15, d25, d0 + 
+ ; (input[0] - input[2]) * cospi_16_64 + vsub.s32 q2, q2, q13 + vsub.s32 q3, q3, q15 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q2, #14 ; >> 14 + vqrshrn.s32 d23, q3, #14 ; >> 14 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vdup.16 d0, r8; ; duplicate cospi_24_64 + vdup.16 d1, r9; ; duplicate cospi_8_64 + + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[3] * cospi_8_64 + vmull.s16 q13, d28, d1 + vmull.s16 q15, d29, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vsub.s32 q2, q2, q13 + vsub.s32 q3, q3, q15 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; input[1] * cospi_8_64 + vmull.s16 q2, d20, d1 + vmull.s16 q3, d21, d1 + + ; input[3] * cospi_24_64 + vmull.s16 q8, d28, d0 + vmull.s16 q10, d29, d0 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vadd.s32 q0, q2, q8 + vadd.s32 q1, q3, q10 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q0, #14 ; >> 14 + vqrshrn.s32 d31, q1, #14 ; >> 14 + + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; stage 3 -odd half + vdup.16 d16, r7; ; duplicate cospi_16_64 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q9, q9, q11 + vsub.s32 q10, q10, q12 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7; ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6; ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5; ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4; ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4; ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5; ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6; ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7; ; output[7] = step1[0] - step1[7]; + MEND + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|vp9_short_idct8x8_add_neon| PROC + push {r4-r9} + vld1.s16 {q8}, [r0]! + vld1.s16 {q9}, [r0]! + vld1.s16 {q10}, [r0]! + vld1.s16 {q11}, [r0]! 
+ vld1.s16 {q12}, [r0]! + vld1.s16 {q13}, [r0]! + vld1.s16 {q14}, [r0]! + vld1.s16 {q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + IDCT8x8_1D + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.u8 {d0}, [r1], r2 + vld1.u8 {d1}, [r1], r2 + vld1.s16 {d2}, [r1], r2 + vld1.s16 {d3}, [r1], r2 + vld1.s16 {d4}, [r1], r2 + vld1.s16 {d5}, [r1], r2 + vld1.s16 {d6}, [r1], r2 + vld1.s16 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + pop {r4-r9} + bx lr + ENDP ; |vp9_short_idct8x8_add_neon| + + END diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 2660344..554a317 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" + #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" @@ -52,7 +53,6 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) { for (i = 0; i < NUM_YV12_BUFFERS; i++) vp9_free_frame_buffer(&oci->yv12_fb[i]); - vp9_free_frame_buffer(&oci->temp_scale_frame); vp9_free_frame_buffer(&oci->post_proc_buffer); vpx_free(oci->mip); @@ -62,9 +62,9 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) { vpx_free(oci->above_context[0]); for (i = 0; i < MAX_MB_PLANE; i++) oci->above_context[i] = 0; - oci->mip = 0; - oci->prev_mip = 0; - oci->above_seg_context = 0; + oci->mip = NULL; + oci->prev_mip = NULL; + oci->above_seg_context = NULL; } static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { @@ -74,7 +74,7 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { cm->mi_cols = aligned_width >> LOG2_MI_SIZE; cm->mi_rows = aligned_height >> LOG2_MI_SIZE; - cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE; + cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; } static void setup_mi(VP9_COMMON *cm) { @@ -94,11 +94,11 @@ static void setup_mi(VP9_COMMON *cm) { int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { int i, mi_cols; - // Our internal buffers are always multiples of 16 
- const int aligned_width = multiple8(width); - const int aligned_height = multiple8(height); + const int aligned_width = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE); + const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE); const int ss_x = oci->subsampling_x; const int ss_y = oci->subsampling_y; + int mi_size; vp9_free_frame_buffers(oci); @@ -120,10 +120,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { oci->fb_idx_ref_cnt[i] = 1; } - if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) - goto fail; - if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) goto fail; @@ -131,14 +127,13 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { set_mb_mi(oci, aligned_width, aligned_height); // Allocation - oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE), - sizeof(MODE_INFO)); + mi_size = oci->mode_info_stride * (oci->mi_rows + MI_BLOCK_SIZE); + + oci->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); if (!oci->mip) goto fail; - oci->prev_mip = vpx_calloc(oci->mode_info_stride * - (oci->mi_rows + 64 / MI_SIZE), - sizeof(MODE_INFO)); + oci->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); if (!oci->prev_mip) goto fail; @@ -146,7 +141,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling // information is exposed at this level - mi_cols = mi_cols_aligned_to_sb(oci); + mi_cols = mi_cols_aligned_to_sb(oci->mi_cols); // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm // block where mi unit size is 8x8. @@ -158,10 +153,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { if (!oci->above_context[0]) goto fail; - for (i = 1; i < MAX_MB_PLANE; i++) - oci->above_context[i] = - oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; - oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); if (!oci->above_seg_context) goto fail; @@ -178,9 +169,8 @@ void vp9_create_common(VP9_COMMON *oci) { vp9_init_mbmode_probs(oci); - oci->txfm_mode = ONLY_4X4; + oci->tx_mode = ONLY_4X4; oci->comp_pred_mode = HYBRID_PREDICTION; - oci->clr_type = REG_YUV; // Initialize reference frame sign bias structure to defaults vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); @@ -197,9 +187,15 @@ void vp9_initialize_common() { } void vp9_update_frame_size(VP9_COMMON *cm) { - const int aligned_width = multiple8(cm->width); - const int aligned_height = multiple8(cm->height); + int i, mi_cols; + const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, LOG2_MI_SIZE); + const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, LOG2_MI_SIZE); set_mb_mi(cm, aligned_width, aligned_height); setup_mi(cm); + + mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + for (i = 1; i < MAX_MB_PLANE; i++) + cm->above_context[i] = + cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; } diff --git a/libvpx/vp9/common/vp9_asm_com_offsets.c b/libvpx/vp9/common/vp9_asm_com_offsets.c deleted file mode 100644 index 94ccb6e..0000000 --- a/libvpx/vp9/common/vp9_asm_com_offsets.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
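As a worked illustration of the MI-based sizing in vp9_alloccommon.c above (assuming LOG2_MI_SIZE == 3 and MI_BLOCK_SIZE == 8, which matches the old 64 / MI_SIZE expression; the frame dimensions are arbitrary examples):

#include <stdio.h>

#define LOG2_MI_SIZE  3
#define MI_BLOCK_SIZE 8
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int width = 100, height = 70;                                   /* example only */
  const int aligned_width  = ALIGN_POWER_OF_TWO(width, LOG2_MI_SIZE);   /* 104 */
  const int aligned_height = ALIGN_POWER_OF_TWO(height, LOG2_MI_SIZE);  /* 72 */
  const int mi_cols = aligned_width >> LOG2_MI_SIZE;                    /* 13 */
  const int mi_rows = aligned_height >> LOG2_MI_SIZE;                   /* 9 */
  const int mode_info_stride = mi_cols + MI_BLOCK_SIZE;                 /* 21 */
  const int mi_size = mode_info_stride * (mi_rows + MI_BLOCK_SIZE);     /* 357 */
  printf("%d MODE_INFO entries allocated\n", mi_size);
  return 0;
}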
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vpx/vpx_codec.h" -#include "vpx_ports/asm_offsets.h" - -BEGIN - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index 37d29af..1297114 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -13,28 +13,25 @@ #define VP9_COMMON_VP9_BLOCKD_H_ #include "./vpx_config.h" + +#include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_common_data.h" #include "vp9/common/vp9_convolve.h" +#include "vp9/common/vp9_enums.h" #include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" -#include "vpx_ports/mem.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_enums.h" #define BLOCK_SIZE_GROUPS 4 -#define MAX_MB_SEGMENTS 8 -#define MB_SEG_TREE_PROBS (MAX_MB_SEGMENTS-1) #define PREDICTION_PROBS 3 #define MBSKIP_CONTEXTS 3 -#define MAX_REF_LF_DELTAS 4 -#define MAX_MODE_LF_DELTAS 2 - /* Segment Feature Masks */ -#define SEGMENT_DELTADATA 0 -#define SEGMENT_ABSDATA 1 #define MAX_MV_REF_CANDIDATES 2 #define INTRA_INTER_CONTEXTS 4 @@ -87,56 +84,28 @@ typedef enum { MB_MODE_COUNT } MB_PREDICTION_MODE; +static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) { + return mode <= TM_PRED; +} + static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } -// Segment level features. -typedef enum { - SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... - SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... - SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame - SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode - SEG_LVL_MAX = 4 // Number of MB level features supported -} SEG_LVL_FEATURES; - -// Segment level features. -typedef enum { - TX_4X4 = 0, // 4x4 dct transform - TX_8X8 = 1, // 8x8 dct transform - TX_16X16 = 2, // 16x16 dct transform - TX_32X32 = 3, // 32x32 dct transform - TX_SIZE_MAX_SB, // Number of transforms available to SBs -} TX_SIZE; - -typedef enum { - DCT_DCT = 0, // DCT in both horizontal and vertical - ADST_DCT = 1, // ADST in vertical, DCT in horizontal - DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3 // ADST in both directions -} TX_TYPE; - #define VP9_INTRA_MODES (TM_PRED + 1) #define VP9_INTER_MODES (1 + NEWMV - NEARESTMV) -#define WHT_UPSCALE_FACTOR 2 - -#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2) - -#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \ - (c)->fc.tx_probs_8x8p : \ - (b) < BLOCK_SIZE_SB32X32 ? \ - (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p) +static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) { + return (mode - NEARESTMV); +} /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. 
*/ union b_mode_info { - struct { - MB_PREDICTION_MODE first; - } as_mode; + MB_PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors }; @@ -150,60 +119,18 @@ typedef enum { } MV_REFERENCE_FRAME; static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) { - switch (sb_type) { - case BLOCK_SIZE_SB4X8: - case BLOCK_SIZE_AB4X4: return 0; - case BLOCK_SIZE_SB8X4: - case BLOCK_SIZE_SB8X8: - case BLOCK_SIZE_SB8X16: return 1; - case BLOCK_SIZE_SB16X8: - case BLOCK_SIZE_MB16X16: - case BLOCK_SIZE_SB16X32: return 2; - case BLOCK_SIZE_SB32X16: - case BLOCK_SIZE_SB32X32: - case BLOCK_SIZE_SB32X64: return 3; - case BLOCK_SIZE_SB64X32: - case BLOCK_SIZE_SB64X64: return 4; - default: assert(0); - return -1; - } + return b_width_log2_lookup[sb_type]; } - static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) { - switch (sb_type) { - case BLOCK_SIZE_SB8X4: - case BLOCK_SIZE_AB4X4: return 0; - case BLOCK_SIZE_SB4X8: - case BLOCK_SIZE_SB8X8: - case BLOCK_SIZE_SB16X8: return 1; - case BLOCK_SIZE_SB8X16: - case BLOCK_SIZE_MB16X16: - case BLOCK_SIZE_SB32X16: return 2; - case BLOCK_SIZE_SB16X32: - case BLOCK_SIZE_SB32X32: - case BLOCK_SIZE_SB64X32: return 3; - case BLOCK_SIZE_SB32X64: - case BLOCK_SIZE_SB64X64: return 4; - default: assert(0); - return -1; - } + return b_height_log2_lookup[sb_type]; } static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) { - int a = b_width_log2(sb_type) - 1; - // align 4x4 block to mode_info - if (a < 0) - a = 0; - assert(a >= 0); - return a; + return mi_width_log2_lookup[sb_type]; } static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) { - int a = b_height_log2(sb_type) - 1; - if (a < 0) - a = 0; - assert(a >= 0); - return a; + return mi_height_log2_lookup[sb_type]; } typedef struct { @@ -214,7 +141,7 @@ typedef struct { int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; int_mv best_mv, best_second_mv; - int mb_mode_context[MAX_REF_FRAMES]; + uint8_t mb_mode_context[MAX_REF_FRAMES]; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ unsigned char segment_id; // Segment id for current frame @@ -237,7 +164,14 @@ typedef struct { union b_mode_info bmi[4]; } MODE_INFO; +enum mv_precision { + MV_PRECISION_Q3, + MV_PRECISION_Q4 +}; + #define VP9_REF_SCALE_SHIFT 14 +#define VP9_REF_NO_SCALE (1 << VP9_REF_SCALE_SHIFT) + struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor int y_scale_fp; // vertical fixed point scale factor @@ -249,9 +183,8 @@ struct scale_factors { int (*scale_value_x)(int val, const struct scale_factors *scale); int (*scale_value_y)(int val, const struct scale_factors *scale); void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); - int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv, - const struct scale_factors *scale); - int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4); + MV32 (*scale_mv_q3_to_q4)(const MV *mv, const struct scale_factors *scale); + MV32 (*scale_mv_q4)(const MV *mv, const struct scale_factors *scale); convolve_fn_t predict[2][2][2]; // horiz, vert, avg }; @@ -283,71 +216,53 @@ struct macroblockd_plane { #define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n)) +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +struct loopfilter { + int filter_level; + + int sharpness_level; + int last_sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, GF, ARF + signed char ref_deltas[MAX_REF_LF_DELTAS]; + signed char 
last_ref_deltas[MAX_REF_LF_DELTAS]; + + // 0 = ZERO_MV, MV + signed char mode_deltas[MAX_MODE_LF_DELTAS]; + signed char last_mode_deltas[MAX_MODE_LF_DELTAS]; +}; + typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; struct scale_factors scale_factor[2]; - struct scale_factors scale_factor_uv[2]; MODE_INFO *prev_mode_info_context; MODE_INFO *mode_info_context; int mode_info_stride; - FRAME_TYPE frame_type; - int up_available; int left_available; int right_available; + struct segmentation seg; + struct loopfilter lf; + // partition contexts PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT *left_seg_context; - /* 0 (disable) 1 (enable) segmentation */ - unsigned char segmentation_enabled; - - /* 0 (do not update) 1 (update) the macroblock segmentation map. */ - unsigned char update_mb_segmentation_map; - - /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char update_mb_segmentation_data; - - /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ - unsigned char mb_segment_abs_delta; - - /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */ - /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */ - - // Probability Tree used to code Segment number - vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS]; - - // Segment features - int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX]; - unsigned int segment_feature_mask[MAX_MB_SEGMENTS]; - - /* mode_based Loop filter adjustment */ - unsigned char mode_ref_lf_delta_enabled; - unsigned char mode_ref_lf_delta_update; - - /* Delta values have the range +/- MAX_LOOP_FILTER */ - /* 0 = Intra, Last, GF, ARF */ - signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; - /* 0 = Intra, Last, GF, ARF */ - signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; - /* 0 = ZERO_MV, MV */ - signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - /* 0 = ZERO_MV, MV */ - signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; - /* Distance of MB away from frame edges */ int mb_to_left_edge; int mb_to_right_edge; int mb_to_top_edge; int mb_to_bottom_edge; - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - int lossless; /* Inverse transform function pointers. 
*/ void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride); @@ -360,15 +275,16 @@ typedef struct macroblockd { int corrupted; - int sb_index; // index of 32x32 block inside the 64x64 block - int mb_index; // index of 16x16 block inside the 32x32 block - int b_index; // index of 8x8 block inside the 16x16 block - int ab_index; // index of 4x4 block inside the 8x8 block + unsigned char sb_index; // index of 32x32 block inside the 64x64 block + unsigned char mb_index; // index of 16x16 block inside the 32x32 block + unsigned char b_index; // index of 8x8 block inside the 16x16 block + unsigned char ab_index; // index of 4x4 block inside the 8x8 block + int q_index; } MACROBLOCKD; -static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { +static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { switch (subsize) { case BLOCK_SIZE_SB64X64: case BLOCK_SIZE_SB64X32: @@ -396,38 +312,21 @@ static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { static INLINE void update_partition_context(MACROBLOCKD *xd, BLOCK_SIZE_TYPE sb_type, BLOCK_SIZE_TYPE sb_size) { - int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; - int bwl = b_width_log2(sb_type); - int bhl = b_height_log2(sb_type); - int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; - int i; + const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; + const int bwl = b_width_log2(sb_type); + const int bhl = b_height_log2(sb_type); + const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; + const char pcval0 = ~(0xe << boffset); + const char pcval1 = ~(0xf << boffset); + const char pcvalue[2] = {pcval0, pcval1}; + + assert(MAX(bwl, bhl) <= bsl); // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition // bits of smaller block sizes to be zero. 
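  // (Equivalently: the above contexts depend only on whether the block spans
  // the full width at this level (bwl == bsl) and the left contexts only on
  // whether it spans the full height (bhl == bsl), so the four-way branch
  // collapses to two vpx_memset() calls with the precomputed pcvalue[] bytes:
  // ~(0xf << boffset) for a full span, ~(0xe << boffset) otherwise.)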
- if ((bwl == bsl) && (bhl == bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xf << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xf << boffset); - } else if ((bwl == bsl) && (bhl < bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xe << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xf << boffset); - } else if ((bwl < bsl) && (bhl == bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xf << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xe << boffset); - } else if ((bwl < bsl) && (bhl < bsl)) { - for (i = 0; i < bs; i++) - xd->left_seg_context[i] = ~(0xe << boffset); - for (i = 0; i < bs; i++) - xd->above_seg_context[i] = ~(0xe << boffset); - } else { - assert(0); - } + vpx_memset(xd->above_seg_context, pcvalue[bwl == bsl], bs); + vpx_memset(xd->left_seg_context, pcvalue[bhl == bsl], bs); } static INLINE int partition_plane_context(MACROBLOCKD *xd, @@ -453,134 +352,57 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize, PARTITION_TYPE partition) { - BLOCK_SIZE_TYPE subsize; - switch (partition) { - case PARTITION_NONE: - subsize = bsize; - break; - case PARTITION_HORZ: - if (bsize == BLOCK_SIZE_SB64X64) - subsize = BLOCK_SIZE_SB64X32; - else if (bsize == BLOCK_SIZE_SB32X32) - subsize = BLOCK_SIZE_SB32X16; - else if (bsize == BLOCK_SIZE_MB16X16) - subsize = BLOCK_SIZE_SB16X8; - else if (bsize == BLOCK_SIZE_SB8X8) - subsize = BLOCK_SIZE_SB8X4; - else - assert(0); - break; - case PARTITION_VERT: - if (bsize == BLOCK_SIZE_SB64X64) - subsize = BLOCK_SIZE_SB32X64; - else if (bsize == BLOCK_SIZE_SB32X32) - subsize = BLOCK_SIZE_SB16X32; - else if (bsize == BLOCK_SIZE_MB16X16) - subsize = BLOCK_SIZE_SB8X16; - else if (bsize == BLOCK_SIZE_SB8X8) - subsize = BLOCK_SIZE_SB4X8; - else - assert(0); - break; - case PARTITION_SPLIT: - if (bsize == BLOCK_SIZE_SB64X64) - subsize = BLOCK_SIZE_SB32X32; - else if (bsize == BLOCK_SIZE_SB32X32) - subsize = BLOCK_SIZE_MB16X16; - else if (bsize == BLOCK_SIZE_MB16X16) - subsize = BLOCK_SIZE_SB8X8; - else if (bsize == BLOCK_SIZE_SB8X8) - subsize = BLOCK_SIZE_AB4X4; - else - assert(0); - break; - default: - assert(0); - } + BLOCK_SIZE_TYPE subsize = subsize_lookup[partition][bsize]; + assert(subsize != BLOCK_SIZE_TYPES); return subsize; } -// transform mapping -static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) { - switch (bmode) { - case TM_PRED : - case D135_PRED : - return ADST_ADST; +extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; - case V_PRED : - case D117_PRED : - case D63_PRED: - return ADST_DCT; +static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, int ib) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const mbmi = &mi->mbmi; - case H_PRED : - case D153_PRED : - case D27_PRED : - return DCT_ADST; + if (plane_type != PLANE_TYPE_Y_WITH_DC || + xd->lossless || + mbmi->ref_frame[0] != INTRA_FRAME) + return DCT_DCT; - default: - return DCT_DCT; - } + return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ? 
+ mi->bmi[ib].as_mode : mbmi->mode]; } -static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) { - TX_TYPE tx_type; - MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; - if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME) - return DCT_DCT; - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - tx_type = txfm_map(mi->bmi[ib].as_mode.first); - } else { - assert(mbmi->mode <= TM_PRED); - tx_type = txfm_map(mbmi->mode); - } - return tx_type; +static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + return plane_type == PLANE_TYPE_Y_WITH_DC ? + mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; } -static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) { - TX_TYPE tx_type = DCT_DCT; - if (xd->mode_info_context->mbmi.mode <= TM_PRED) { - tx_type = txfm_map(xd->mode_info_context->mbmi.mode); - } - return tx_type; +static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + return plane_type == PLANE_TYPE_Y_WITH_DC ? + mode2txfm_map[xd->mode_info_context->mbmi.mode] : DCT_DCT; } -static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) { - TX_TYPE tx_type = DCT_DCT; - if (xd->mode_info_context->mbmi.mode <= TM_PRED) { - tx_type = txfm_map(xd->mode_info_context->mbmi.mode); +static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; } - return tx_type; +#if CONFIG_ALPHA + // TODO(jkoleszar): Using the Y w/h for now + xd->plane[3].subsampling_x = 0; + xd->plane[3].subsampling_y = 0; +#endif } -void vp9_setup_block_dptrs(MACROBLOCKD *xd, - int subsampling_x, int subsampling_y); - -static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - const TX_SIZE size = mbmi->txfm_size; - - switch (mbmi->sb_type) { - case BLOCK_SIZE_SB64X64: - return size; - case BLOCK_SIZE_SB64X32: - case BLOCK_SIZE_SB32X64: - case BLOCK_SIZE_SB32X32: - if (size == TX_32X32) - return TX_16X16; - else - return size; - case BLOCK_SIZE_SB32X16: - case BLOCK_SIZE_SB16X32: - case BLOCK_SIZE_MB16X16: - if (size == TX_16X16) - return TX_8X8; - else - return size; - default: - return TX_4X4; - } - return size; +static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { + return MIN(mbmi->txfm_size, max_uv_txsize_lookup[mbmi->sb_type]); } struct plane_block_idx { @@ -619,6 +441,16 @@ static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize, return 4 << (b_height_log2(bsize) - plane->subsampling_y); } +static INLINE int plane_block_width_log2by4( + BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { + return (b_width_log2(bsize) - plane->subsampling_x); +} + +static INLINE int plane_block_height_log2by4( + BLOCK_SIZE_TYPE bsize, const struct macroblockd_plane* plane) { + return (b_height_log2(bsize) - plane->subsampling_y); +} + typedef void (*foreach_transformed_block_visitor)(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, @@ -795,11 +627,11 @@ static int txfrm_block_to_raster_block(MACROBLOCKD *xd, int ss_txfrm_size) { const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int txwl = ss_txfrm_size / 2; - const int tx_cols_lg2 = bwl - txwl; - const int tx_cols = 1 << tx_cols_lg2; + const int tx_cols_log2 = bwl - txwl; + const int tx_cols = 1 << tx_cols_log2; const int raster_mb = block >> ss_txfrm_size; const int x = (raster_mb & (tx_cols - 1)) << 
(txwl); - const int y = raster_mb >> tx_cols_lg2 << (txwl); + const int y = raster_mb >> tx_cols_log2 << (txwl); return x + (y << bwl); } @@ -810,11 +642,11 @@ static void txfrm_block_to_raster_xy(MACROBLOCKD *xd, int *x, int *y) { const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int txwl = ss_txfrm_size / 2; - const int tx_cols_lg2 = bwl - txwl; - const int tx_cols = 1 << tx_cols_lg2; + const int tx_cols_log2 = bwl - txwl; + const int tx_cols = 1 << tx_cols_log2; const int raster_mb = block >> ss_txfrm_size; *x = (raster_mb & (tx_cols - 1)) << (txwl); - *y = raster_mb >> tx_cols_lg2 << (txwl); + *y = raster_mb >> tx_cols_log2 << (txwl); } static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 0d7babf..1796906 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -22,12 +22,11 @@ #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) -#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) +#define ROUND_POWER_OF_TWO(value, n) \ + (((value) + (1 << ((n) - 1))) >> (n)) -/* If we don't want to use ROUND_POWER_OF_TWO macro -static INLINE int16_t round_power_of_two(int16_t value, int n) { - return (value + (1 << (n - 1))) >> n; -}*/ +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) // Only need this for fixed-size arrays, for structs just assign. #define vp9_copy(dest, src) { \ @@ -56,10 +55,35 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -static INLINE int multiple8(int value) { - return (value + 7) & ~7; +static int get_unsigned_bits(unsigned int num_values) { + int cat = 0; + if (num_values <= 1) + return 0; + num_values--; + while (num_values > 0) { + cat++; + num_values >>= 1; + } + return cat; } +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(cm, lval, expr) do { \ + lval = (expr); \ + if (!lval) \ + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate "#lval" at %s:%d", \ + __FILE__, __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(cm, lval, expr) do { \ + lval = (expr); \ + if (!lval) \ + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate "#lval); \ + } while (0) +#endif + #define SYNC_CODE_0 0x49 #define SYNC_CODE_1 0x83 #define SYNC_CODE_2 0x42 diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c new file mode 100644 index 0000000..dee44ec --- /dev/null +++ b/libvpx/vp9/common/vp9_common_data.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
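A quick worked example of the ROUND_POWER_OF_TWO and ALIGN_POWER_OF_TWO helpers added to vp9_common.h above; the macros are copied from that hunk and the values are chosen purely for illustration:

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) \
    (((value) + (1 << ((n) - 1))) >> (n))
#define ALIGN_POWER_OF_TWO(value, n) \
    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

static void check_power_of_two_helpers(void) {
  assert(ROUND_POWER_OF_TWO(5, 1) == 3);    /* (5 + 1) >> 1: 2.5 rounds up to 3 */
  assert(ROUND_POWER_OF_TWO(70, 3) == 9);   /* (70 + 4) >> 3: 8.75 rounds to 9  */
  assert(ALIGN_POWER_OF_TWO(5, 3) == 8);    /* round 5 up to a multiple of 8    */
  assert(ALIGN_POWER_OF_TWO(64, 4) == 64);  /* 64 is already 16-aligned         */
}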
+ */ + + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common_data.h" + +// Log 2 conversion lookup tables for block width and height +const int b_width_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; +const int b_height_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; +const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; +const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] = + {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; +// Log 2 conversion lookup tables for modeinfo width and height +const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; +const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; +const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; +const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; + +const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { + { // 4X4 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID + }, { // 8X8 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID + }, { // 16X16 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 32X32 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, + PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, + PARTITION_NONE + } +}; + +const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { + { // PARTITION_NONE + BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB8X4, + BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8, + BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X64, + }, { // PARTITION_HORZ + BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB64X32, + }, { // PARTITION_VERT + BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + 
BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB32X64, + }, { // PARTITION_SPLIT + BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_SIZE_SB32X32, + } +}; + +const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, + TX_16X16, TX_16X16, TX_16X16, + TX_32X32, TX_32X32, TX_32X32, TX_32X32 +}; +const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = { + TX_4X4, TX_4X4, TX_4X4, + TX_4X4, TX_4X4, TX_4X4, + TX_8X8, TX_8X8, TX_8X8, + TX_16X16, TX_16X16, TX_16X16, TX_32X32 +}; + +const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = { + {BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8, + BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8}, + {BLOCK_SIZE_SB8X4, BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, + BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB8X16}, + {BLOCK_SIZE_SB16X8, BLOCK_SIZE_SB16X8, BLOCK_SIZE_MB16X16, + BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32}, + {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64}, + {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64} +}; diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h new file mode 100644 index 0000000..8b0f8a5 --- /dev/null +++ b/libvpx/vp9/common/vp9_common_data.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
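The lookup tables above replace the per-size switch statements removed from vp9_blockd.h earlier in this diff (get_subsize and get_uv_tx_size). A minimal sketch of how they are consulted, assuming it is compiled inside the libvpx tree so the enums from vp9_enums.h are visible; the two checks mirror cases of the removed switches:

#include <assert.h>
#include "vp9/common/vp9_common_data.h"

static void check_lookup_tables(void) {
  /* Splitting a 64x64 superblock horizontally yields two 64x32 blocks. */
  assert(subsize_lookup[PARTITION_HORZ][BLOCK_SIZE_SB64X64] ==
         BLOCK_SIZE_SB64X32);
  /* For a 32x32 block the chroma transform is capped at 16x16, so
   * MIN(TX_32X32, max_uv_txsize_lookup[BLOCK_SIZE_SB32X32]) gives TX_16X16,
   * matching the old get_uv_tx_size() switch. */
  assert(max_uv_txsize_lookup[BLOCK_SIZE_SB32X32] == TX_16X16);
}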
+ */ + +#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ +#define VP9_COMMON_VP9_COMMON_DATA_H_ + +#include "vp9/common/vp9_enums.h" + +extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; +extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; +extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; +extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; +extern const PARTITION_TYPE + partition_lookup[][BLOCK_SIZE_TYPES]; + + +extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; +extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; +extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; +extern const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5]; + +#endif // VP9_COMMON_VP9_COMMON_DATA_H diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c index 46ae503..6f1e418 100644 --- a/libvpx/vp9/common/vp9_convolve.c +++ b/libvpx/vp9/common/vp9_convolve.c @@ -38,8 +38,8 @@ */ #define ALIGN_FILTERS_256 1 -static void convolve_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride, } } -static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x0, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, } } -static void convolve_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { @@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride, } } -static void convolve_avg_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y0, int y_step_q4, int w, int h, int taps) { @@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride, } } -static void convolve_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -217,12 +217,13 @@ static void convolve_c(const uint8_t *src, int src_stride, * h == 64, taps == 8. 
*/ uint8_t temp[64 * 135]; - int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; assert(w <= 64); assert(h <= 64); assert(taps <= 8); assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); if (intermediate_height < h) intermediate_height = h; @@ -236,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride, w, h, taps); } -static void convolve_avg_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int taps) { @@ -246,12 +247,13 @@ static void convolve_avg_c(const uint8_t *src, int src_stride, * h == 64, taps == 8. */ uint8_t temp[64 * 135]; - int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1; assert(w <= 64); assert(h <= 64); assert(taps <= 8); assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); if (intermediate_height < h) intermediate_height = h; @@ -265,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride, w, h, taps); } -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -275,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -285,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -295,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -305,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -315,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride, w, h, 8); } -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -337,33 +339,25 @@ void 
vp9_convolve8_avg_c(const uint8_t *src, int src_stride, w, h); } -void vp9_convolve_copy(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { - if (w == 16 && h == 16) { - vp9_copy_mem16x16(src, src_stride, dst, dst_stride); - } else if (w == 8 && h == 8) { - vp9_copy_mem8x8(src, src_stride, dst, dst_stride); - } else if (w == 8 && h == 4) { - vp9_copy_mem8x4(src, src_stride, dst, dst_stride); - } else { - int r; - - for (r = h; r > 0; --r) { - memcpy(dst, src, w); - src += src_stride; - dst += dst_stride; - } +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int r; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; } } -void vp9_convolve_avg(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { int x, y; for (y = 0; y < h; ++y) { diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h index 0596080..3de8111 100644 --- a/libvpx/vp9/common/vp9_convolve.h +++ b/libvpx/vp9/common/vp9_convolve.h @@ -13,26 +13,12 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -// Not a convolution, a block copy conforming to the convolution prototype -void vp9_convolve_copy(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -// Not a convolution, a block average conforming to the convolution prototype -void vp9_convolve_avg(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - struct subpix_fn_table { const int16_t (*filter_x)[8]; const int16_t (*filter_y)[8]; diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c index 5841f80..370ebe8 100644 --- a/libvpx/vp9/common/vp9_debugmodes.c +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -11,126 +11,68 @@ #include <stdio.h> #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" -void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, - int frame, char *file) { +static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, + cm->show_frame, cm->base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure. 
+ */ +static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor, + size_t member_offset) { int mi_row; int mi_col; int mi_index = 0; - FILE *mvs = fopen(file, "a"); - - // Print out the macroblock Y modes - fprintf(mvs, "SB Types for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type); - - mi_index++; - } - - fprintf(mvs, "\n"); - mi_index += 8; - } + MODE_INFO *mi = common->mi; + int rows = common->mi_rows; + int cols = common->mi_cols; + char prefix = descriptor[0]; - // Print out the macroblock Y modes - fprintf(mvs, "Mb Modes for Frame %d\n", frame); + log_frame_info(common, descriptor, file); mi_index = 0; for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode); - + fprintf(file, "%2d ", + *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset))); mi_index++; } - - fprintf(mvs, "\n"); + fprintf(file, "\n"); mi_index += 8; } - - fprintf(mvs, "\n"); - - mi_index = 0; - fprintf(mvs, "Mb mv ref for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]); - - mi_index++; - } - - fprintf(mvs, "\n"); - mi_index += 8; - } - fprintf(mvs, "\n"); - - mi_index = 0; - fprintf(mvs, "Mb mv ref for Frame %d\n", frame); - + fprintf(file, "\n"); +} +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { + int mi_row; + int mi_col; + int mi_index = 0; + FILE *mvs = fopen(file, "a"); + MODE_INFO *mi = cm->mi; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); + print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); + + log_frame_info(cm, "Vectors ",mvs); for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs,"V "); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row, mi[mi_index].mbmi.mv[0].as_mv.col); - mi_index++; } - fprintf(mvs, "\n"); mi_index += 8; } - - fprintf(mvs, "\n"); - - /* print out the macroblock txform sizes */ - mi_index = 0; - fprintf(mvs, "TXFM size for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - - fprintf(mvs, "\n"); - - /* print out the macroblock UV modes */ - mi_index = 0; - fprintf(mvs, "UV Modes for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - - fprintf(mvs, "\n"); - - /* print out the macroblock mvs */ - mi_index = 0; - fprintf(mvs, "MVs for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2, - mi[mi_index].mbmi.mv[0].as_mv.col / 2); - - mi_index++; - } - - mi_index += 8; - 
fprintf(mvs, "\n"); - } - fprintf(mvs, "\n"); fclose(mvs); diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h index 1954093..185fced 100644 --- a/libvpx/vp9/common/vp9_default_coef_probs.h +++ b/libvpx/vp9/common/vp9_default_coef_probs.h @@ -8,695 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ - /*Generated file, included by vp9_entropy.c*/ - -#if CONFIG_BALANCED_COEFTREE -static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 6, 213, 178 }, - { 26, 113, 132 }, - { 34, 17, 68 } - }, { /* Coeff Band 1 */ - { 66, 96, 178 }, - { 63, 96, 174 }, - { 67, 54, 154 }, - { 62, 28, 126 }, - { 48, 9, 84 }, - { 20, 1, 32 } - }, { /* Coeff Band 2 */ - { 64, 144, 206 }, - { 70, 99, 191 }, - { 69, 36, 152 }, - { 55, 9, 106 }, - { 35, 1, 60 }, - { 14, 1, 22 } - }, { /* Coeff Band 3 */ - { 82, 154, 222 }, - { 83, 112, 205 }, - { 81, 31, 164 }, - { 62, 7, 118 }, - { 42, 1, 74 }, - { 18, 1, 30 } - }, { /* Coeff Band 4 */ - { 52, 179, 233 }, - { 64, 132, 214 }, - { 73, 36, 170 }, - { 59, 8, 116 }, - { 38, 1, 65 }, - { 15, 1, 26 } - }, { /* Coeff Band 5 */ - { 29, 175, 238 }, - { 26, 169, 223 }, - { 41, 80, 182 }, - { 39, 32, 127 }, - { 26, 10, 69 }, - { 11, 2, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 21, 226, 234 }, - { 52, 182, 212 }, - { 80, 112, 177 } - }, { /* Coeff Band 1 */ - { 111, 164, 243 }, - { 88, 152, 231 }, - { 90, 43, 186 }, - { 70, 12, 132 }, - { 44, 2, 76 }, - { 19, 1, 33 } - }, { /* Coeff Band 2 */ - { 96, 185, 246 }, - { 99, 127, 231 }, - { 88, 21, 177 }, - { 64, 5, 122 }, - { 38, 1, 69 }, - { 18, 1, 30 } - }, { /* Coeff Band 3 */ - { 84, 206, 249 }, - { 94, 147, 237 }, - { 95, 33, 187 }, - { 71, 8, 131 }, - { 47, 1, 83 }, - { 26, 1, 44 } - }, { /* Coeff Band 4 */ - { 38, 221, 252 }, - { 58, 177, 241 }, - { 78, 46, 188 }, - { 59, 9, 122 }, - { 34, 1, 66 }, - { 18, 1, 34 } - }, { /* Coeff Band 5 */ - { 21, 216, 253 }, - { 21, 206, 244 }, - { 42, 93, 200 }, - { 43, 41, 146 }, - { 36, 13, 93 }, - { 31, 1, 55 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 213, 219 }, - { 23, 139, 182 }, - { 38, 60, 125 } - }, { /* Coeff Band 1 */ - { 69, 156, 220 }, - { 52, 178, 213 }, - { 69, 111, 190 }, - { 69, 58, 155 }, - { 58, 21, 104 }, - { 39, 7, 60 } - }, { /* Coeff Band 2 */ - { 68, 189, 228 }, - { 70, 158, 221 }, - { 83, 64, 189 }, - { 73, 18, 141 }, - { 48, 4, 88 }, - { 23, 1, 41 } - }, { /* Coeff Band 3 */ - { 99, 194, 236 }, - { 91, 138, 224 }, - { 91, 53, 189 }, - { 74, 20, 142 }, - { 48, 6, 90 }, - { 22, 1, 41 } - }, { /* Coeff Band 4 */ - { 52, 203, 244 }, - { 60, 168, 231 }, - { 75, 62, 189 }, - { 61, 18, 132 }, - { 38, 4, 72 }, - { 17, 1, 39 } - }, { /* Coeff Band 5 */ - { 33, 192, 247 }, - { 31, 185, 234 }, - { 46, 85, 185 }, - { 39, 35, 132 }, - { 28, 15, 80 }, - { 13, 5, 38 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 5, 247, 246 }, - { 28, 209, 228 }, - { 65, 137, 203 } - }, { /* Coeff Band 1 */ - { 69, 208, 250 }, - { 54, 207, 242 }, - { 81, 92, 204 }, - { 70, 54, 153 }, - { 58, 40, 108 }, - { 58, 35, 71 } - }, { /* Coeff Band 2 */ - { 65, 215, 250 }, - { 72, 185, 239 }, - { 92, 50, 197 }, - { 75, 14, 147 }, - { 49, 2, 99 }, - { 26, 1, 53 } - }, { /* Coeff Band 3 */ - { 70, 220, 251 }, - { 76, 186, 241 }, - { 90, 65, 198 }, - { 75, 26, 151 }, - { 58, 12, 112 }, - { 34, 6, 49 } - }, { /* Coeff Band 4 */ - { 34, 224, 253 }, - { 44, 204, 245 }, - { 69, 85, 204 }, - { 64, 31, 150 
}, - { 44, 2, 78 }, - { 1, 1, 128 } - }, { /* Coeff Band 5 */ - { 25, 216, 253 }, - { 21, 215, 248 }, - { 47, 108, 214 }, - { 47, 48, 160 }, - { 26, 20, 90 }, - { 64, 171, 128 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 9, 203, 199 }, - { 26, 92, 128 }, - { 28, 11, 55 } - }, { /* Coeff Band 1 */ - { 99, 54, 160 }, - { 78, 99, 155 }, - { 80, 44, 138 }, - { 71, 17, 115 }, - { 51, 5, 80 }, - { 27, 1, 40 } - }, { /* Coeff Band 2 */ - { 135, 81, 190 }, - { 113, 61, 182 }, - { 93, 16, 153 }, - { 70, 4, 115 }, - { 41, 1, 68 }, - { 16, 1, 27 } - }, { /* Coeff Band 3 */ - { 155, 103, 214 }, - { 129, 48, 199 }, - { 95, 10, 159 }, - { 63, 1, 110 }, - { 32, 1, 58 }, - { 12, 1, 21 } - }, { /* Coeff Band 4 */ - { 163, 149, 231 }, - { 137, 69, 213 }, - { 95, 11, 164 }, - { 62, 3, 108 }, - { 32, 1, 57 }, - { 13, 1, 22 } - }, { /* Coeff Band 5 */ - { 136, 189, 239 }, - { 123, 102, 223 }, - { 97, 19, 170 }, - { 66, 4, 111 }, - { 38, 1, 60 }, - { 18, 1, 26 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 24, 226, 244 }, - { 54, 178, 211 }, - { 80, 74, 152 } - }, { /* Coeff Band 1 */ - { 145, 153, 236 }, - { 101, 163, 223 }, - { 108, 50, 187 }, - { 90, 22, 145 }, - { 66, 8, 97 }, - { 42, 4, 50 } - }, { /* Coeff Band 2 */ - { 150, 159, 238 }, - { 128, 90, 218 }, - { 94, 9, 163 }, - { 64, 3, 110 }, - { 34, 1, 61 }, - { 13, 1, 24 } - }, { /* Coeff Band 3 */ - { 151, 162, 242 }, - { 135, 80, 222 }, - { 93, 9, 166 }, - { 61, 3, 111 }, - { 31, 1, 59 }, - { 12, 1, 22 } - }, { /* Coeff Band 4 */ - { 161, 170, 245 }, - { 140, 84, 228 }, - { 99, 8, 174 }, - { 64, 1, 116 }, - { 34, 1, 63 }, - { 14, 1, 26 } - }, { /* Coeff Band 5 */ - { 138, 197, 246 }, - { 127, 109, 233 }, - { 100, 16, 179 }, - { 66, 3, 119 }, - { 37, 1, 66 }, - { 16, 1, 30 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 6, 216, 212 }, - { 25, 134, 171 }, - { 43, 48, 118 } - }, { /* Coeff Band 1 */ - { 93, 112, 209 }, - { 66, 159, 206 }, - { 82, 78, 184 }, - { 75, 28, 148 }, - { 46, 4, 82 }, - { 18, 1, 28 } - }, { /* Coeff Band 2 */ - { 108, 148, 220 }, - { 90, 130, 216 }, - { 92, 40, 186 }, - { 73, 10, 135 }, - { 46, 1, 79 }, - { 20, 1, 35 } - }, { /* Coeff Band 3 */ - { 125, 173, 232 }, - { 109, 117, 223 }, - { 97, 31, 183 }, - { 71, 7, 127 }, - { 44, 1, 76 }, - { 21, 1, 36 } - }, { /* Coeff Band 4 */ - { 133, 195, 236 }, - { 112, 121, 224 }, - { 97, 23, 178 }, - { 69, 3, 122 }, - { 42, 1, 72 }, - { 19, 1, 34 } - }, { /* Coeff Band 5 */ - { 132, 180, 238 }, - { 119, 102, 225 }, - { 101, 18, 179 }, - { 71, 3, 124 }, - { 42, 1, 70 }, - { 17, 1, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 5, 242, 250 }, - { 26, 198, 226 }, - { 58, 98, 168 } - }, { /* Coeff Band 1 */ - { 82, 201, 246 }, - { 50, 219, 237 }, - { 94, 107, 205 }, - { 89, 61, 167 }, - { 77, 31, 131 }, - { 57, 14, 91 } - }, { /* Coeff Band 2 */ - { 99, 202, 247 }, - { 96, 165, 234 }, - { 100, 31, 190 }, - { 72, 8, 131 }, - { 41, 1, 72 }, - { 14, 1, 24 } - }, { /* Coeff Band 3 */ - { 108, 204, 248 }, - { 107, 156, 235 }, - { 103, 27, 186 }, - { 71, 4, 124 }, - { 39, 1, 66 }, - { 14, 1, 19 } - }, { /* Coeff Band 4 */ - { 120, 211, 248 }, - { 118, 149, 234 }, - { 107, 19, 182 }, - { 72, 3, 126 }, - { 40, 1, 69 }, - { 16, 1, 24 } - }, { /* Coeff Band 5 */ - { 127, 199, 245 }, - { 122, 125, 232 }, - { 112, 20, 186 }, - { 82, 3, 136 }, - { 55, 1, 88 }, - { 10, 1, 38 } - } - } - } -}; -static const vp9_coeff_probs_model 
default_coef_probs_16x16[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 25, 9, 101 }, - { 25, 2, 67 }, - { 15, 1, 28 } - }, { /* Coeff Band 1 */ - { 67, 30, 118 }, - { 61, 56, 116 }, - { 60, 31, 105 }, - { 52, 11, 85 }, - { 34, 2, 54 }, - { 14, 1, 22 } - }, { /* Coeff Band 2 */ - { 107, 58, 149 }, - { 92, 53, 147 }, - { 78, 14, 123 }, - { 56, 3, 87 }, - { 35, 1, 56 }, - { 17, 1, 27 } - }, { /* Coeff Band 3 */ - { 142, 61, 171 }, - { 111, 30, 162 }, - { 80, 4, 128 }, - { 53, 1, 87 }, - { 31, 1, 52 }, - { 14, 1, 24 } - }, { /* Coeff Band 4 */ - { 171, 73, 200 }, - { 129, 28, 184 }, - { 86, 3, 140 }, - { 54, 1, 90 }, - { 28, 1, 49 }, - { 12, 1, 21 } - }, { /* Coeff Band 5 */ - { 193, 129, 227 }, - { 148, 28, 200 }, - { 90, 2, 144 }, - { 53, 1, 90 }, - { 28, 1, 50 }, - { 13, 1, 22 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 60, 7, 234 }, - { 64, 4, 184 }, - { 56, 1, 104 } - }, { /* Coeff Band 1 */ - { 150, 111, 210 }, - { 87, 185, 202 }, - { 101, 81, 177 }, - { 90, 34, 142 }, - { 67, 11, 95 }, - { 38, 2, 51 } - }, { /* Coeff Band 2 */ - { 153, 139, 218 }, - { 120, 72, 195 }, - { 90, 11, 147 }, - { 63, 3, 101 }, - { 39, 1, 61 }, - { 20, 1, 33 } - }, { /* Coeff Band 3 */ - { 171, 132, 223 }, - { 131, 56, 200 }, - { 92, 6, 147 }, - { 58, 1, 95 }, - { 32, 1, 52 }, - { 14, 1, 23 } - }, { /* Coeff Band 4 */ - { 183, 137, 227 }, - { 139, 48, 204 }, - { 91, 3, 148 }, - { 55, 1, 91 }, - { 28, 1, 47 }, - { 13, 1, 21 } - }, { /* Coeff Band 5 */ - { 198, 149, 234 }, - { 153, 32, 208 }, - { 95, 2, 148 }, - { 55, 1, 90 }, - { 30, 1, 51 }, - { 16, 1, 25 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 209, 217 }, - { 31, 106, 151 }, - { 40, 21, 86 } - }, { /* Coeff Band 1 */ - { 101, 71, 184 }, - { 74, 131, 177 }, - { 88, 50, 158 }, - { 78, 16, 129 }, - { 51, 2, 82 }, - { 18, 1, 29 } - }, { /* Coeff Band 2 */ - { 116, 115, 199 }, - { 102, 88, 191 }, - { 94, 22, 160 }, - { 74, 6, 122 }, - { 47, 1, 77 }, - { 18, 1, 30 } - }, { /* Coeff Band 3 */ - { 157, 124, 210 }, - { 130, 53, 201 }, - { 102, 10, 165 }, - { 73, 1, 120 }, - { 42, 1, 69 }, - { 16, 1, 27 } - }, { /* Coeff Band 4 */ - { 174, 147, 225 }, - { 134, 67, 212 }, - { 100, 10, 168 }, - { 66, 1, 111 }, - { 36, 1, 60 }, - { 16, 1, 27 } - }, { /* Coeff Band 5 */ - { 185, 165, 232 }, - { 147, 56, 214 }, - { 105, 5, 165 }, - { 66, 1, 108 }, - { 35, 1, 59 }, - { 16, 1, 27 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 3, 232, 245 }, - { 18, 162, 210 }, - { 38, 64, 131 } - }, { /* Coeff Band 1 */ - { 84, 187, 239 }, - { 35, 231, 231 }, - { 82, 150, 209 }, - { 87, 97, 181 }, - { 81, 64, 151 }, - { 67, 60, 119 } - }, { /* Coeff Band 2 */ - { 107, 185, 239 }, - { 100, 149, 224 }, - { 107, 34, 185 }, - { 83, 12, 141 }, - { 49, 4, 92 }, - { 21, 1, 40 } - }, { /* Coeff Band 3 */ - { 125, 184, 243 }, - { 121, 127, 228 }, - { 113, 25, 185 }, - { 82, 6, 134 }, - { 48, 1, 82 }, - { 26, 1, 38 } - }, { /* Coeff Band 4 */ - { 143, 185, 245 }, - { 133, 115, 231 }, - { 114, 14, 184 }, - { 77, 3, 126 }, - { 43, 1, 68 }, - { 34, 1, 40 } - }, { /* Coeff Band 5 */ - { 170, 194, 241 }, - { 151, 80, 226 }, - { 118, 9, 180 }, - { 81, 1, 130 }, - { 51, 1, 78 }, - { 18, 1, 49 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 29, 42, 137 }, - { 26, 3, 60 }, - { 13, 1, 23 } - }, { /* Coeff Band 1 */ - { 69, 36, 122 }, - { 63, 57, 123 }, - { 60, 33, 112 }, - { 52, 11, 90 }, - { 32, 2, 52 }, - { 
10, 1, 15 } - }, { /* Coeff Band 2 */ - { 107, 55, 143 }, - { 86, 69, 143 }, - { 74, 24, 116 }, - { 52, 5, 78 }, - { 29, 1, 44 }, - { 12, 1, 18 } - }, { /* Coeff Band 3 */ - { 137, 71, 160 }, - { 107, 34, 152 }, - { 73, 6, 114 }, - { 44, 1, 69 }, - { 25, 1, 40 }, - { 12, 1, 18 } - }, { /* Coeff Band 4 */ - { 165, 70, 174 }, - { 118, 24, 159 }, - { 74, 3, 117 }, - { 45, 1, 73 }, - { 26, 1, 43 }, - { 12, 1, 19 } - }, { /* Coeff Band 5 */ - { 220, 93, 223 }, - { 153, 10, 187 }, - { 86, 2, 131 }, - { 49, 1, 79 }, - { 26, 1, 43 }, - { 12, 1, 20 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 30, 58, 227 }, - { 35, 10, 172 }, - { 24, 23, 112 } - }, { /* Coeff Band 1 */ - { 117, 145, 219 }, - { 51, 221, 216 }, - { 75, 169, 196 }, - { 88, 96, 165 }, - { 77, 43, 117 }, - { 53, 18, 60 } - }, { /* Coeff Band 2 */ - { 128, 176, 225 }, - { 108, 114, 202 }, - { 92, 19, 152 }, - { 65, 4, 103 }, - { 38, 1, 61 }, - { 19, 1, 30 } - }, { /* Coeff Band 3 */ - { 146, 184, 228 }, - { 122, 95, 205 }, - { 92, 11, 149 }, - { 62, 1, 98 }, - { 35, 1, 57 }, - { 17, 1, 26 } - }, { /* Coeff Band 4 */ - { 165, 192, 230 }, - { 132, 81, 206 }, - { 93, 6, 147 }, - { 58, 1, 94 }, - { 32, 1, 52 }, - { 15, 1, 24 } - }, { /* Coeff Band 5 */ - { 204, 223, 234 }, - { 156, 49, 204 }, - { 97, 3, 145 }, - { 59, 1, 92 }, - { 33, 1, 52 }, - { 15, 1, 24 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 184, 200 }, - { 25, 67, 113 }, - { 30, 9, 59 } - }, { /* Coeff Band 1 */ - { 92, 42, 158 }, - { 65, 121, 159 }, - { 77, 56, 146 }, - { 70, 22, 120 }, - { 47, 4, 76 }, - { 18, 1, 26 } - }, { /* Coeff Band 2 */ - { 113, 81, 177 }, - { 96, 75, 167 }, - { 84, 24, 136 }, - { 63, 8, 100 }, - { 37, 1, 58 }, - { 13, 1, 19 } - }, { /* Coeff Band 3 */ - { 147, 85, 194 }, - { 119, 36, 178 }, - { 88, 8, 139 }, - { 59, 1, 93 }, - { 31, 1, 49 }, - { 10, 1, 18 } - }, { /* Coeff Band 4 */ - { 169, 108, 210 }, - { 131, 41, 191 }, - { 92, 5, 144 }, - { 56, 1, 88 }, - { 29, 1, 47 }, - { 14, 1, 22 } - }, { /* Coeff Band 5 */ - { 210, 106, 223 }, - { 148, 14, 192 }, - { 89, 2, 138 }, - { 52, 1, 84 }, - { 29, 1, 47 }, - { 14, 1, 23 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 3, 207, 245 }, - { 12, 102, 213 }, - { 18, 33, 144 } - }, { /* Coeff Band 1 */ - { 85, 205, 245 }, - { 18, 249, 242 }, - { 59, 221, 229 }, - { 91, 166, 213 }, - { 88, 117, 183 }, - { 70, 95, 149 } - }, { /* Coeff Band 2 */ - { 114, 193, 241 }, - { 104, 155, 221 }, - { 100, 33, 181 }, - { 78, 10, 132 }, - { 43, 2, 75 }, - { 15, 1, 48 } - }, { /* Coeff Band 3 */ - { 118, 198, 244 }, - { 117, 142, 224 }, - { 111, 25, 179 }, - { 83, 4, 134 }, - { 57, 1, 84 }, - { 1, 1, 1 } - }, { /* Coeff Band 4 */ - { 144, 201, 248 }, - { 136, 130, 234 }, - { 124, 12, 188 }, - { 83, 1, 130 }, - { 61, 1, 66 }, - { 64, 171, 128 } - }, { /* Coeff Band 5 */ - { 174, 227, 250 }, - { 165, 118, 242 }, - { 132, 21, 197 }, - { 84, 3, 134 }, - { 70, 1, 69 }, - { 1, 1, 1 } - } - } - } -}; -#else static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { { /* block Type 0 */ { /* Intra */ @@ -1381,4 +693,4 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { } } }; -#endif + diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index 080867e..0ad0dbc 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -15,6 +15,8 @@ #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) + DECLARE_ALIGNED(16, const uint8_t, 
vp9_norm[256]) = { 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -50,28 +52,28 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; -DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = { 0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = { 0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15, }; -DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { 0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15, }; -DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = { +DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, 33, 19, 40, 12, 34, 27, 5, 41, @@ -82,7 +84,7 @@ DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = { 46, 39, 61, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = { 0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3, 26, 41, 11, 56, 19, 34, 4, 49, @@ -93,7 +95,7 @@ DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { 31, 61, 39, 54, 47, 62, 55, 63, }; -DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = { 0, 1, 2, 8, 9, 3, 16, 10, 4, 17, 11, 24, 5, 18, 25, 12, 19, 26, 32, 6, 13, 20, 33, 27, @@ -104,7 +106,7 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { 60, 39, 61, 47, 54, 55, 62, 63, }; -DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = { 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, @@ -123,7 +125,7 @@ DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = { 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255, }; -DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = { 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, @@ -142,7 +144,7 @@ DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255, }; -DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = { 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, @@ -161,7 +163,7 @@ DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255, }; -DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = { 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 
227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 102, 352, 8, 197, 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136, @@ -200,13 +202,8 @@ DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = { const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ { -#if CONFIG_BALANCED_COEFTREE - -ZERO_TOKEN, 2, /* 0 = ZERO */ - -DCT_EOB_TOKEN, 4, /* 1 = EOB */ -#else -DCT_EOB_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ -#endif -ONE_TOKEN, 6, /* 2 = ONE */ 8, 12, /* 3 = LOW_VAL */ -TWO_TOKEN, 10, /* 4 = TWO */ @@ -233,13 +230,8 @@ static const vp9_prob Pcat6[] = { }; const vp9_tree_index vp9_coefmodel_tree[6] = { -#if CONFIG_BALANCED_COEFTREE - -ZERO_TOKEN, 2, - -DCT_EOB_MODEL_TOKEN, 4, -#else -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ -ZERO_TOKEN, 4, /* 1 = ZERO */ -#endif -ONE_TOKEN, -TWO_TOKEN, }; @@ -252,7 +244,7 @@ const vp9_tree_index vp9_coefmodel_tree[6] = { // the probabilities for the rest of the nodes. // beta = 8 -const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { +static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { { 3, 86, 128, 6, 86, 23, 88, 29}, { 9, 86, 129, 17, 88, 61, 94, 76}, { 15, 87, 129, 28, 89, 93, 100, 110}, @@ -386,8 +378,7 @@ const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { static void extend_model_to_full_distribution(vp9_prob p, vp9_prob *tree_probs) { const int l = ((p - 1) / 2); - const vp9_prob (*model)[MODEL_NODES]; - model = vp9_modelcoefprobs_pareto8; + const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; if (p & 1) { vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, model[l], MODEL_NODES * sizeof(vp9_prob)); @@ -406,16 +397,6 @@ void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { extend_model_to_full_distribution(model[PIVOT_NODE], full); } -void vp9_model_to_full_probs_sb( - vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES], - vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) { - int c, p; - for (c = 0; c < COEF_BANDS; ++c) - for (p = 0; p < PREV_COEF_CONTEXTS; ++p) { - vp9_model_to_full_probs(model[c][p], full[c][p]); - } -} - static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; static void init_bit_tree(vp9_tree_index *p, int n) { @@ -455,32 +436,6 @@ vp9_extra_bit vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" -// This function updates and then returns n AC coefficient context -// This is currently a placeholder function to allow experimentation -// using various context models based on the energy earlier tokens -// within the current block. -// -// For now it just returns the previously used context. 
-#define MAX_NEIGHBORS 2 -int vp9_get_coef_context(const int *scan, const int *neighbors, - int nb_pad, uint8_t *token_cache, int c, int l) { - int eob = l; - assert(nb_pad == MAX_NEIGHBORS); - if (c == eob) { - return 0; - } else { - int ctx; - assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0); - if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) { - ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] + - token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1; - } else { - ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]]; - } - return ctx; - } -}; - void vp9_default_coef_probs(VP9_COMMON *pc) { vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4, sizeof(pc->fc.coef_probs[TX_4X4])); @@ -496,28 +451,39 @@ void vp9_default_coef_probs(VP9_COMMON *pc) { // in {top, left, topleft, topright, bottomleft} order // for each position in raster scan order. // -1 indicates the neighbor does not exist. -DECLARE_ALIGNED(16, int, - vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); -DECLARE_ALIGNED(16, int, - vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]); - -static int find_in_scan(const int *scan, int l, int idx) { +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); + +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); +DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); +DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); + +static int find_in_scan(const int16_t *scan, int l, int idx) { int n, l2 = l * l; for (n = 0; n < l2; n++) { int rc = scan[n]; @@ -527,14 +493,19 @@ static int find_in_scan(const int *scan, int l, int idx) { assert(0); return -1; } -static void init_scan_neighbors(const int *scan, int l, int *neighbors, - int max_neighbors) { +static void 
init_scan_neighbors(const int16_t *scan, + int16_t *iscan, + int l, int16_t *neighbors) { int l2 = l * l; int n, i, j; - for (n = 0; n < l2; n++) { + // dc doesn't use this type of prediction + neighbors[MAX_NEIGHBORS * 0 + 0] = 0; + neighbors[MAX_NEIGHBORS * 0 + 1] = 0; + iscan[0] = find_in_scan(scan, l, 0); + for (n = 1; n < l2; n++) { int rc = scan[n]; - assert(max_neighbors == MAX_NEIGHBORS); + iscan[n] = find_in_scan(scan, l, n); i = rc / l; j = rc % l; if (i > 0 && j > 0) { @@ -546,93 +517,84 @@ static void init_scan_neighbors(const int *scan, int l, int *neighbors, // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff // as a context. If ADST or DCT is used in both directions, we // use the combination of the two as a context. - int a = find_in_scan(scan, l, (i - 1) * l + j); - int b = find_in_scan(scan, l, i * l + j - 1); + int a = (i - 1) * l + j; + int b = i * l + j - 1; if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || scan == vp9_col_scan_16x16) { - neighbors[max_neighbors * n + 0] = a; - neighbors[max_neighbors * n + 1] = -1; + // in the col/row scan cases (as well as left/top edge cases), we set + // both contexts to the same value, so we can branchlessly do a+b+1>>1 + // which automatically becomes a if a == b + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = a; } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || scan == vp9_row_scan_16x16) { - neighbors[max_neighbors * n + 0] = b; - neighbors[max_neighbors * n + 1] = -1; + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = b; } else { - neighbors[max_neighbors * n + 0] = a; - neighbors[max_neighbors * n + 1] = b; + neighbors[MAX_NEIGHBORS * n + 0] = a; + neighbors[MAX_NEIGHBORS * n + 1] = b; } } else if (i > 0) { - neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j); - neighbors[max_neighbors * n + 1] = -1; - } else if (j > 0) { - neighbors[max_neighbors * n + 0] = - find_in_scan(scan, l, i * l + j - 1); - neighbors[max_neighbors * n + 1] = -1; + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j; } else { - assert(n == 0); - // dc predictor doesn't use previous tokens - neighbors[max_neighbors * n + 0] = -1; + assert(j > 0); + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1; } - assert(neighbors[max_neighbors * n + 0] < n); + assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n); } + // one padding item so we don't have to add branches in code to handle + // calls to get_coef_context() for the token after the final dc token + neighbors[MAX_NEIGHBORS * l2 + 0] = 0; + neighbors[MAX_NEIGHBORS * l2 + 1] = 0; } void vp9_init_neighbors() { - init_scan_neighbors(vp9_default_scan_4x4, 4, - vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_row_scan_4x4, 4, - vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_col_scan_4x4, 4, - vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_default_scan_8x8, 8, - vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_row_scan_8x8, 8, - vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_col_scan_8x8, 8, - vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_default_scan_16x16, 16, - vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_row_scan_16x16, 16, - vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS); - init_scan_neighbors(vp9_col_scan_16x16, 16, - vp9_col_scan_16x16_neighbors, 
MAX_NEIGHBORS); - init_scan_neighbors(vp9_default_scan_32x32, 32, - vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, + vp9_default_scan_4x4_neighbors); + init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, + vp9_row_scan_4x4_neighbors); + init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, + vp9_col_scan_4x4_neighbors); + init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, + vp9_default_scan_8x8_neighbors); + init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, + vp9_row_scan_8x8_neighbors); + init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, + vp9_col_scan_8x8_neighbors); + init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, + vp9_default_scan_16x16_neighbors); + init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, + vp9_row_scan_16x16_neighbors); + init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, + vp9_col_scan_16x16_neighbors); + init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, + vp9_default_scan_32x32_neighbors); } -const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) { +const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { if (scan == vp9_default_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_4x4_neighbors; } else if (scan == vp9_row_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_row_scan_4x4_neighbors; } else if (scan == vp9_col_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_col_scan_4x4_neighbors; } else if (scan == vp9_default_scan_8x8) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_8x8_neighbors; } else if (scan == vp9_row_scan_8x8) { - *pad = 2; return vp9_row_scan_8x8_neighbors; } else if (scan == vp9_col_scan_8x8) { - *pad = 2; return vp9_col_scan_8x8_neighbors; } else if (scan == vp9_default_scan_16x16) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_16x16_neighbors; } else if (scan == vp9_row_scan_16x16) { - *pad = 2; return vp9_row_scan_16x16_neighbors; } else if (scan == vp9_col_scan_16x16) { - *pad = 2; return vp9_col_scan_16x16_neighbors; - } else if (scan == vp9_default_scan_32x32) { - *pad = MAX_NEIGHBORS; - return vp9_default_scan_32x32_neighbors; } else { - assert(0); - return NULL; + assert(scan == vp9_default_scan_32x32); + return vp9_default_scan_32x32_neighbors; } } @@ -651,38 +613,15 @@ void vp9_coef_tree_initialize() { #define COEF_COUNT_SAT_AFTER_KEY 24 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 -void vp9_full_to_model_count(unsigned int *model_count, - unsigned int *full_count) { - int n; - model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; - model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; - model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; - for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n) - model_count[TWO_TOKEN] += full_count[n]; - model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN]; -} - -void vp9_full_to_model_counts( - vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) { - int i, j, k, l; - for (i = 0; i < BLOCK_TYPES; ++i) - for (j = 0; j < REF_TYPES; ++j) - for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; - vp9_full_to_model_count(model_count[i][j][k][l], - full_count[i][j][k][l]); - } -} - static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, int count_sat, int update_factor) { + FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size]; - 
vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size]; - vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size]; + vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size]; + vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cm->fc.eob_branch_counts[txfm_size]; + cm->counts.eob_branch[txfm_size]; int t, i, j, k, l, count; int factor; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; @@ -699,13 +638,8 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, vp9_coefmodel_tree, coef_probs, branch_ct, coef_counts[i][j][k][l], 0); -#if CONFIG_BALANCED_COEFTREE - branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0]; - coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]); -#else branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); -#endif for (t = 0; t < entropy_nodes_adapt; ++t) { count = branch_ct[t][0] + branch_ct[t][1]; count = count > count_sat ? count_sat : count; diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 7f2bf3d..4ea727f 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -52,8 +52,6 @@ typedef struct { extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ -#define PROB_UPDATE_BASELINE_COST 7 - #define MAX_PROB 255 #define DCT_MAX_VALUE 16384 @@ -99,22 +97,62 @@ typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] struct VP9Common; void vp9_default_coef_probs(struct VP9Common *); -extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); + +extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]); + +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]); -extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]); +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); -extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]); +extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); +extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); -extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]); +extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]); +extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); +extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]); +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); -extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]); +extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]); +extern DECLARE_ALIGNED(16, 
int16_t, vp9_row_iscan_16x16[256]); + +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); + +#define MAX_NEIGHBORS 2 + +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); +extern DECLARE_ALIGNED(16, int16_t, + vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); void vp9_coef_tree_initialize(void); void vp9_adapt_coef_probs(struct VP9Common *); @@ -148,9 +186,14 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) { ? (COEF_BANDS-1) : band_translate[coef_index]; } -extern int vp9_get_coef_context(const int *scan, const int *neighbors, - int nb_pad, uint8_t *token_cache, int c, int l); -const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); +static INLINE int get_coef_context(const int16_t *neighbors, + uint8_t *token_cache, + int c) { + return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; +} + +const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); // 128 lists of probabilities are stored for the following ONE node probs: @@ -160,7 +203,6 @@ const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); #define COEFPROB_MODELS 128 #define UNCONSTRAINED_NODES 3 -#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) #define PIVOT_NODE 2 // which node is pivot @@ -174,20 +216,10 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS] [PREV_COEF_CONTEXTS] [UNCONSTRAINED_NODES][2]; -extern void vp9_full_to_model_count(unsigned int *model_count, - unsigned int *full_count); -extern void vp9_full_to_model_counts( - vp9_coeff_count_model *model_count, vp9_coeff_count *full_count); void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -void vp9_model_to_full_probs_sb( - vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES], - vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]); - -extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1]; - -static INLINE const int* get_scan_4x4(TX_TYPE tx_type) { +static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: return vp9_row_scan_4x4; @@ -198,7 +230,36 @@ static INLINE const int* get_scan_4x4(TX_TYPE tx_type) { } } -static INLINE const int* get_scan_8x8(TX_TYPE tx_type) { +static INLINE void get_scan_nb_4x4(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_4x4; + *nb = vp9_row_scan_4x4_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_4x4; + *nb = vp9_col_scan_4x4_neighbors; + break; + default: + *scan = vp9_default_scan_4x4; + *nb = vp9_default_scan_4x4_neighbors; + break; + } +} + +static INLINE const int16_t* 
get_iscan_4x4(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_4x4; + case DCT_ADST: + return vp9_col_iscan_4x4; + default: + return vp9_default_iscan_4x4; + } +} + +static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: return vp9_row_scan_8x8; @@ -209,7 +270,36 @@ static INLINE const int* get_scan_8x8(TX_TYPE tx_type) { } } -static INLINE const int* get_scan_16x16(TX_TYPE tx_type) { +static INLINE void get_scan_nb_8x8(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_8x8; + *nb = vp9_row_scan_8x8_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_8x8; + *nb = vp9_col_scan_8x8_neighbors; + break; + default: + *scan = vp9_default_scan_8x8; + *nb = vp9_default_scan_8x8_neighbors; + break; + } +} + +static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_8x8; + case DCT_ADST: + return vp9_col_iscan_8x8; + default: + return vp9_default_iscan_8x8; + } +} + +static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: return vp9_row_scan_16x16; @@ -220,6 +310,35 @@ static INLINE const int* get_scan_16x16(TX_TYPE tx_type) { } } +static INLINE void get_scan_nb_16x16(TX_TYPE tx_type, + const int16_t **scan, const int16_t **nb) { + switch (tx_type) { + case ADST_DCT: + *scan = vp9_row_scan_16x16; + *nb = vp9_row_scan_16x16_neighbors; + break; + case DCT_ADST: + *scan = vp9_col_scan_16x16; + *nb = vp9_col_scan_16x16_neighbors; + break; + default: + *scan = vp9_default_scan_16x16; + *nb = vp9_default_scan_16x16_neighbors; + break; + } +} + +static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_iscan_16x16; + case DCT_ADST: + return vp9_col_iscan_16x16; + default: + return vp9_default_iscan_16x16; + } +} + enum { VP9_COEF_UPDATE_PROB = 252 }; #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 3302814..ca188e4 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -8,15 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_modecont.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_alloccommon.h" -#include "vpx_mem/vpx_mem.h" -static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { +const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, @@ -51,8 +50,9 @@ static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ }; -const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] - [PARTITION_TYPES - 1] = { +static const vp9_prob default_partition_probs[NUM_FRAME_TYPES] + [NUM_PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { { /* frame_type = keyframe */ /* 8x8 -> 4x4 */ { 158, 97, 94 } /* a/l both not split */, @@ -98,6 +98,133 @@ const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] } }; +const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES] + [VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { + { /* above = dc */ + { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, + { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, + { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, + { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, + { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, + { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, + { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, + { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, + { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ + }, { /* above = v */ + { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, + { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, + { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, + { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, + { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, + { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, + { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, + { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, + { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ + }, { /* above = h */ + { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, + { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, + { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, + { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, + { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, + { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, + { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, + { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, + { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ + }, { /* above = d45 */ + { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, + { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, + { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, + { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, + { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, + { 53, 26, 34, 130, 56, 149, 
84, 120, 103 } /* left = d117 */, + { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, + { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, + { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ + }, { /* above = d135 */ + { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, + { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, + { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, + { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, + { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, + { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, + { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, + { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, + { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ + }, { /* above = d117 */ + { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, + { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, + { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */, + { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, + { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, + { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, + { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, + { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, + { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ + }, { /* above = d153 */ + { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, + { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, + { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, + { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, + { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, + { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, + { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, + { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, + { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ + }, { /* above = d27 */ + { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, + { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, + { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, + { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, + { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, + { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, + { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, + { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, + { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ + }, { /* above = d63 */ + { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, + { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, + { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, + { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, + { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, + { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, + { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, + { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, + { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ + }, { /* above = tm */ + { 65, 70, 60, 155, 159, 
199, 61, 60, 81 } /* left = dc */, + { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, + { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, + { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, + { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, + { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, + { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, + { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, + { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ + } +}; + +static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS] + [VP9_INTER_MODES - 1] = { + {2, 173, 34}, // 0 = both zero mv + {7, 145, 85}, // 1 = one zero mv + one a predicted mv + {7, 166, 63}, // 2 = two predicted mvs + {7, 94, 66}, // 3 = one predicted/zero and one new mv + {8, 64, 46}, // 4 = two new mvs + {17, 81, 31}, // 5 = one intra neighbour + x + {25, 29, 30}, // 6 = two intra neighbours +}; + /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { -DC_PRED, 2, /* 0 = DC_NODE */ @@ -111,7 +238,7 @@ const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { -D153_PRED, -D27_PRED /* 8 = D153_NODE */ }; -const vp9_tree_index vp9_sb_mv_ref_tree[6] = { +const vp9_tree_index vp9_inter_mode_tree[6] = { -ZEROMV, 2, -NEARESTMV, 4, -NEARMV, -NEWMV @@ -124,8 +251,7 @@ const vp9_tree_index vp9_partition_tree[6] = { }; struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; - -struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES]; +struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; @@ -149,20 +275,15 @@ static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = { { 238, 247 } }; -const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 1] = { - { 3, 136, 37, }, - { 5, 52, 13, }, -}; -const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 2] = { - { 20, 152, }, - { 15, 101, }, -}; -const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 3] = { - { 100, }, - { 66, }, +static const struct tx_probs default_tx_probs = { + { { 3, 136, 37 }, + { 5, 52, 13 } }, + + { { 20, 152 }, + { 15, 101 } }, + + { { 100 }, + { 66 } } }; void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, @@ -181,52 +302,40 @@ void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, unsigned int (*ct_16x16p)[2]) { ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; - ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + - tx_count_16x16p[TX_16X16]; + ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16]; ct_16x16p[1][0] = tx_count_16x16p[TX_8X8]; ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; } void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]) { - ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; - ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; + ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; + ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; } -const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = { +static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { 192, 128, 64 }; -void vp9_init_mbmode_probs(VP9_COMMON *x) { - vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs, - sizeof(default_if_uv_probs)); - vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs, - 
sizeof(default_kf_uv_probs)); - vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs, - sizeof(default_if_y_probs)); - - vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob, - sizeof(vp9_switchable_interp_prob)); - - vpx_memcpy(x->fc.partition_prob, vp9_partition_probs, - sizeof(vp9_partition_probs)); - - vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p, - sizeof(default_intra_inter_p)); - vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p, - sizeof(default_comp_inter_p)); - vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p, - sizeof(default_comp_ref_p)); - vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p, - sizeof(default_single_ref_p)); - vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p, - sizeof(vp9_default_tx_probs_32x32p)); - vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p, - sizeof(vp9_default_tx_probs_16x16p)); - vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p, - sizeof(vp9_default_tx_probs_8x8p)); - vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs, - sizeof(vp9_default_mbskip_probs)); +static const vp9_prob default_switchable_interp_prob[VP9_SWITCHABLE_FILTERS+1] + [VP9_SWITCHABLE_FILTERS-1] = { + { 235, 162, }, + { 36, 255, }, + { 34, 3, }, + { 149, 144, }, +}; + +void vp9_init_mbmode_probs(VP9_COMMON *cm) { + vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs); + vp9_copy(cm->fc.y_mode_prob, default_if_y_probs); + vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(cm->fc.partition_prob, default_partition_probs); + vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p); + vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p); + vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p); + vp9_copy(cm->fc.single_ref_prob, default_single_ref_p); + cm->fc.tx_probs = default_tx_probs; + vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); } const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { @@ -236,40 +345,22 @@ const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP}; -const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1}; -const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS-1] = { - { 235, 162, }, - { 36, 255, }, - { 34, 3, }, - { 149, 144, }, -}; - -// Indicates if the filter is interpolating or non-interpolating -const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1}; +const int vp9_switchable_interp_map[SWITCHABLE + 1] = {1, 0, 2, -1, -1}; void vp9_entropy_mode_init() { vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); vp9_tokens_from_tree(vp9_switchable_interp_encodings, vp9_switchable_interp_tree); vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - - vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array, - vp9_sb_mv_ref_tree, NEARESTMV); -} - -void vp9_init_mode_contexts(VP9_COMMON *pc) { - vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts)); - vpx_memcpy(pc->fc.inter_mode_probs, - vp9_default_inter_mode_probs, - sizeof(vp9_default_inter_mode_probs)); + vp9_tokens_from_tree_offset(vp9_inter_mode_encodings, + vp9_inter_mode_tree, NEARESTMV); } void vp9_accum_mv_refs(VP9_COMMON *pc, MB_PREDICTION_MODE m, const int context) { unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = - 
pc->fc.inter_mode_counts; + pc->counts.inter_mode; if (m == ZEROMV) { ++inter_mode_counts[context][0][0]; @@ -288,39 +379,32 @@ void vp9_accum_mv_refs(VP9_COMMON *pc, } } -#define MVREF_COUNT_SAT 20 -#define MVREF_MAX_UPDATE_FACTOR 128 -void vp9_adapt_mode_context(VP9_COMMON *pc) { - int i, j; - unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = - pc->fc.inter_mode_counts; - vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs; - - for (j = 0; j < INTER_MODE_CONTEXTS; j++) { - for (i = 0; i < VP9_INTER_MODES - 1; i++) { - int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1]; - int factor; - count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count; - factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT); - mode_context[j][i] = weighted_prob( - pc->fc.pre_inter_mode_probs[j][i], - get_binary_prob(inter_mode_counts[j][i][0], - inter_mode_counts[j][i][1]), - factor); - } - } -} +#define COUNT_SAT 20 +#define MAX_UPDATE_FACTOR 128 -#define MODE_COUNT_SAT 20 -#define MODE_MAX_UPDATE_FACTOR 128 -static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob, - unsigned int branch_ct[2]) { - int factor, count = branch_ct[0] + branch_ct[1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); +static int update_ct(vp9_prob pre_prob, vp9_prob prob, + unsigned int ct[2]) { + const int count = MIN(ct[0] + ct[1], COUNT_SAT); + const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT; return weighted_prob(pre_prob, prob, factor); } +static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) { + return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct); +} + +void vp9_adapt_mode_context(VP9_COMMON *pc) { + int i, j; + FRAME_CONTEXT *const fc = &pc->fc; + FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx]; + FRAME_COUNTS *const counts = &pc->counts; + + for (j = 0; j < INTER_MODE_CONTEXTS; j++) + for (i = 0; i < VP9_INTER_MODES - 1; i++) + fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i], + counts->inter_mode[j][i]); +} + static void update_mode_probs(int n_modes, const vp9_tree_index *tree, unsigned int *cnt, vp9_prob *pre_probs, vp9_prob *dst_probs, @@ -333,189 +417,127 @@ static void update_mode_probs(int n_modes, assert(n_modes - 1 < MAX_PROBS); vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); for (t = 0; t < n_modes - 1; ++t) - dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]); -} - -static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) { - return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0], - branch_ct[1]), branch_ct); + dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]); } -// #define MODE_COUNT_TESTING void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; FRAME_CONTEXT *fc = &cm->fc; -#ifdef MODE_COUNT_TESTING - int t; - - printf("static const unsigned int\nymode_counts" - "[VP9_INTRA_MODES] = {\n"); - for (t = 0; t < VP9_INTRA_MODES; ++t) - printf("%d, ", fc->ymode_counts[t]); - printf("};\n"); - printf("static const unsigned int\nuv_mode_counts" - "[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n"); - for (i = 0; i < VP9_INTRA_MODES; ++i) { - printf(" {"); - for (t = 0; t < VP9_INTRA_MODES; ++t) - printf("%d, ", fc->uv_mode_counts[i][t]); - printf("},\n"); - } - printf("};\n"); - printf("static const unsigned int\nbmode_counts" - "[VP9_NKF_BINTRAMODES] = {\n"); - for (t = 0; t < VP9_NKF_BINTRAMODES; ++t) - printf("%d, ", fc->bmode_counts[t]); - 
printf("};\n"); - printf("static const unsigned int\ni8x8_mode_counts" - "[VP9_I8X8_MODES] = {\n"); - for (t = 0; t < VP9_I8X8_MODES; ++t) - printf("%d, ", fc->i8x8_mode_counts[t]); - printf("};\n"); - printf("static const unsigned int\nmbsplit_counts" - "[VP9_NUMMBSPLITS] = {\n"); - for (t = 0; t < VP9_NUMMBSPLITS; ++t) - printf("%d, ", fc->mbsplit_counts[t]); - printf("};\n"); -#endif + FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + FRAME_COUNTS *counts = &cm->counts; for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i], - fc->intra_inter_count[i]); + fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i], + counts->intra_inter[i]); for (i = 0; i < COMP_INTER_CONTEXTS; i++) - fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i], - fc->comp_inter_count[i]); + fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i], + counts->comp_inter[i]); for (i = 0; i < REF_CONTEXTS; i++) - fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i], - fc->comp_ref_count[i]); + fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i], + counts->comp_ref[i]); for (i = 0; i < REF_CONTEXTS; i++) for (j = 0; j < 2; j++) - fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j], - fc->single_ref_count[i][j]); + fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], + counts->single_ref[i][j]); for (i = 0; i < BLOCK_SIZE_GROUPS; i++) update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, - fc->y_mode_counts[i], fc->pre_y_mode_prob[i], + counts->y_mode[i], pre_fc->y_mode_prob[i], fc->y_mode_prob[i], 0); for (i = 0; i < VP9_INTRA_MODES; ++i) update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, - fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i], + counts->uv_mode[i], pre_fc->uv_mode_prob[i], fc->uv_mode_prob[i], 0); for (i = 0; i < NUM_PARTITION_CONTEXTS; i++) update_mode_probs(PARTITION_TYPES, vp9_partition_tree, - fc->partition_counts[i], fc->pre_partition_prob[i], + counts->partition[i], + pre_fc->partition_prob[INTER_FRAME][i], fc->partition_prob[INTER_FRAME][i], 0); if (cm->mcomp_filter_type == SWITCHABLE) { - for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree, - fc->switchable_interp_count[i], - fc->pre_switchable_interp_prob[i], + counts->switchable_interp[i], + pre_fc->switchable_interp_prob[i], fc->switchable_interp_prob[i], 0); - } } - if (cm->txfm_mode == TX_MODE_SELECT) { + + if (cm->tx_mode == TX_MODE_SELECT) { int j; unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2]; unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2]; unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2]; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i], - branch_ct_8x8p); - for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) { - int factor; - int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1]; - vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0], - branch_ct_8x8p[j][1]); - count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - cm->fc.tx_probs_8x8p[i][j] = weighted_prob( - cm->fc.pre_tx_probs_8x8p[i][j], prob, factor); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i], + tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); + for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], + branch_ct_8x8p[j]); + + tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) { - int factor; - int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1]; - vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0], - branch_ct_16x16p[j][1]); - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - cm->fc.tx_probs_16x16p[i][j] = weighted_prob( - cm->fc.pre_tx_probs_16x16p[i][j], prob, factor); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i], + for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], + branch_ct_16x16p[j]); + + tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) { - int factor; - int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1]; - vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0], - branch_ct_32x32p[j][1]); - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - cm->fc.tx_probs_32x32p[i][j] = weighted_prob( - cm->fc.pre_tx_probs_32x32p[i][j], prob, factor); - } + for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], + branch_ct_32x32p[j]); } } + for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i], - fc->mbskip_count[i]); + fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i], + counts->mbskip[i]); } static void set_default_lf_deltas(MACROBLOCKD *xd) { - xd->mode_ref_lf_delta_enabled = 1; - xd->mode_ref_lf_delta_update = 1; + xd->lf.mode_ref_delta_enabled = 1; + xd->lf.mode_ref_delta_update = 1; - xd->ref_lf_deltas[INTRA_FRAME] = 1; - xd->ref_lf_deltas[LAST_FRAME] = 0; - xd->ref_lf_deltas[GOLDEN_FRAME] = -1; - xd->ref_lf_deltas[ALTREF_FRAME] = -1; + xd->lf.ref_deltas[INTRA_FRAME] = 1; + xd->lf.ref_deltas[LAST_FRAME] = 0; + xd->lf.ref_deltas[GOLDEN_FRAME] = -1; + xd->lf.ref_deltas[ALTREF_FRAME] = -1; - xd->mode_lf_deltas[0] = 0; // Zero - xd->mode_lf_deltas[1] = 0; // New mv + xd->lf.mode_deltas[0] = 0; + xd->lf.mode_deltas[1] = 0; } void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). 
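
(Aside on the probability-adaptation refactor that runs through the vp9_entropymode.c hunks above: the per-frame update now reads the previous frame's probabilities from cm->frame_contexts[cm->frame_context_idx] and the observed counts from cm->counts, and every saturate-and-blend computation funnels through update_ct()/update_ct2(). The sketch below restates that arithmetic as standalone C. binary_prob() and adapt_prob() are illustrative stand-ins rather than the actual libvpx helpers; only the COUNT_SAT and MAX_UPDATE_FACTOR blending scheme is taken from the hunks shown.)

#include <stdint.h>
#include <stdio.h>

#define COUNT_SAT          20
#define MAX_UPDATE_FACTOR 128

typedef uint8_t vp9_prob;

/* p is roughly 256 * ct0 / (ct0 + ct1), clamped to [1, 255] so neither
   branch ever becomes impossible; 128 (a coin flip) with no observations. */
static vp9_prob binary_prob(unsigned ct0, unsigned ct1) {
  const unsigned den = ct0 + ct1;
  unsigned p = den ? (256u * ct0 + den / 2) / den : 128u;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return (vp9_prob)p;
}

/* Blend the previous frame's probability with the newly measured one; the
   blend weight grows linearly with the observation count and saturates at
   MAX_UPDATE_FACTOR/256 once COUNT_SAT samples have been seen. */
static vp9_prob adapt_prob(vp9_prob pre, const unsigned ct[2]) {
  unsigned count = ct[0] + ct[1];
  unsigned factor;
  if (count > COUNT_SAT) count = COUNT_SAT;
  factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
  return (vp9_prob)((pre * (256 - factor) +
                     binary_prob(ct[0], ct[1]) * factor + 128) >> 8);
}

int main(void) {
  const unsigned ct[2] = { 30, 10 };  /* 30 hits on branch 0, 10 on branch 1 */
  printf("adapted prob = %d\n", adapt_prob(128, ct));  /* 160: half-way to 192 */
  return 0;
}

With COUNT_SAT = 20 and MAX_UPDATE_FACTOR = 128, a context seen only a handful of times in the previous frame barely moves its probability, while twenty or more observations pull it exactly half-way toward the measured frequency.
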
int i; - vp9_clearall_segfeatures(xd); - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + vp9_clearall_segfeatures(&xd->seg); + xd->seg.abs_delta = SEGMENT_DELTADATA; if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); // Reset the mode ref deltas for loop filter - vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas)); - vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas)); + vp9_zero(xd->lf.last_ref_deltas); + vp9_zero(xd->lf.last_mode_deltas); set_default_lf_deltas(xd); + // To force update of the sharpness + xd->lf.last_sharpness_level = -1; + vp9_default_coef_probs(cm); vp9_init_mbmode_probs(cm); - vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs, - sizeof(vp9_kf_default_bmode_probs)); vp9_init_mv_probs(cm); + vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs); - // To force update of the sharpness - cm->last_sharpness_level = -1; - - vp9_init_mode_contexts(cm); - - if ((cm->frame_type == KEY_FRAME) || - cm->error_resilient_mode || (cm->reset_frame_context == 3)) { + if (cm->frame_type == KEY_FRAME || + cm->error_resilient_mode || cm->reset_frame_context == 3) { // Reset all frame contexts. for (i = 0; i < NUM_FRAME_CONTEXTS; ++i) - vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc)); + cm->frame_contexts[i] = cm->fc; } else if (cm->reset_frame_context == 2) { // Reset only the frame context specified in the frame header. - vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc, - sizeof(cm->fc)); + cm->frame_contexts[cm->frame_context_idx] = cm->fc; } vpx_memset(cm->prev_mip, 0, @@ -529,7 +551,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { vp9_update_mode_info_border(cm, cm->prev_mip); vp9_update_mode_info_in_image(cm, cm->prev_mi); - vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); + vp9_zero(cm->ref_frame_sign_bias); cm->frame_context_idx = 0; } diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index aa8aec7..8c14e7e 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -16,81 +16,68 @@ #define SUBMVREF_COUNT 5 #define TX_SIZE_CONTEXTS 2 - #define VP9_MODE_UPDATE_PROB 252 +#define VP9_SWITCHABLE_FILTERS 3 // number of switchable filters // #define MODE_STATS -extern int vp9_mv_cont(const int_mv *l, const int_mv *a); +struct VP9Common; + +struct tx_probs { + vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; +}; +struct tx_counts { + unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; + unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; +}; -extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; +extern const vp9_prob vp9_kf_y_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1]; extern const vp9_tree_index vp9_intra_mode_tree[]; -extern const vp9_tree_index vp9_sb_mv_ref_tree[]; +extern const vp9_tree_index vp9_inter_mode_tree[]; extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; - -/* Inter mode values do not start at zero */ - -extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES]; +extern struct vp9_token vp9_inter_mode_encodings[VP9_INTER_MODES]; // probability models for 
partition information -extern const vp9_tree_index vp9_partition_tree[]; +extern const vp9_tree_index vp9_partition_tree[]; extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; -extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES] - [NUM_PARTITION_CONTEXTS] - [PARTITION_TYPES - 1]; - -void vp9_entropy_mode_init(void); - -struct VP9Common; -/* sets up common features to forget past dependence */ -void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); +extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp + [VP9_SWITCHABLE_FILTERS]; -void vp9_init_mbmode_probs(struct VP9Common *x); +extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; -extern void vp9_init_mode_contexts(struct VP9Common *pc); +extern const vp9_tree_index vp9_switchable_interp_tree + [2 * (VP9_SWITCHABLE_FILTERS - 1)]; -extern void vp9_adapt_mode_context(struct VP9Common *pc); +extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; -extern void vp9_accum_mv_refs(struct VP9Common *pc, - MB_PREDICTION_MODE m, - const int context); +void vp9_entropy_mode_init(); -void vp9_adapt_mode_probs(struct VP9Common *); +int vp9_mv_cont(const int_mv *l, const int_mv *a); -#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */ +void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); -extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp - [VP9_SWITCHABLE_FILTERS]; +void vp9_init_mbmode_probs(struct VP9Common *x); -extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; +void vp9_adapt_mode_context(struct VP9Common *pc); -extern const int vp9_is_interpolating_filter[SWITCHABLE + 1]; +void vp9_adapt_mode_probs(struct VP9Common *); -extern const vp9_tree_index vp9_switchable_interp_tree - [2 * (VP9_SWITCHABLE_FILTERS - 1)]; +void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context); -extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]); +void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]); +void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]); -extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - -extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 1]; -extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 2]; -extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS] - [TX_SIZE_MAX_SB - 3]; - -extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, - unsigned int (*ct_32x32p)[2]); -extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, - unsigned int (*ct_16x16p)[2]); -extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, - unsigned int (*ct_8x8p)[2]); #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index e07e43c..343b624 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -12,17 +12,12 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_entropymv.h" -//#define MV_COUNT_TESTING - #define MV_COUNT_SAT 20 #define MV_MAX_UPDATE_FACTOR 128 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */ #define COMPANDED_MVREF_THRESH 8 -/* Smooth or bias the mv-counts 
before prob computation */ -/* #define SMOOTH_MV_COUNTS */ - const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { -MV_JOINT_ZERO, 2, -MV_JOINT_HNZVZ, 4, @@ -56,7 +51,7 @@ const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { }; struct vp9_token vp9_mv_fp_encodings[4]; -const nmv_context vp9_default_nmv_context = { +static const nmv_context default_nmv_context = { {32, 64, 96}, { { /* vert component */ @@ -82,21 +77,10 @@ const nmv_context vp9_default_nmv_context = { }, }; -MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { - if (mv->row == 0 && mv->col == 0) - return MV_JOINT_ZERO; - else if (mv->row == 0 && mv->col != 0) - return MV_JOINT_HNZVZ; - else if (mv->row != 0 && mv->col == 0) - return MV_JOINT_HZVNZ; - else - return MV_JOINT_HNZVNZ; -} - #define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { - MV_CLASS_TYPE c; + MV_CLASS_TYPE c = MV_CLASS_0; if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; @@ -114,7 +98,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { return c; } -int vp9_use_nmv_hp(const MV *ref) { +int vp9_use_mv_hp(const MV *ref) { return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; } @@ -123,95 +107,71 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { return mv_class_base(c) + offset; } -static void increment_nmv_component_count(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { - assert (v != 0); /* should not be zero */ - mvcomp->mvcount[MV_MAX + v] += incr; +static void inc_mv_component_count(int v, nmv_component_counts *comp_counts, + int incr) { + assert (v != 0); + comp_counts->mvcount[MV_MAX + v] += incr; } -static void increment_nmv_component(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { +static void inc_mv_component(int v, nmv_component_counts *comp_counts, + int incr, int usehp) { int s, z, c, o, d, e, f; if (!incr) return; assert (v != 0); /* should not be zero */ s = v < 0; - mvcomp->sign[s] += incr; + comp_counts->sign[s] += incr; z = (s ? 
-v : v) - 1; /* magnitude - 1 */ c = vp9_get_mv_class(z, &o); - mvcomp->classes[c] += incr; + comp_counts->classes[c] += incr; d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ e = (o & 1); /* high precision mv data */ if (c == MV_CLASS_0) { - mvcomp->class0[d] += incr; + comp_counts->class0[d] += incr; } else { int i; int b = c + CLASS0_BITS - 1; // number of bits for (i = 0; i < b; ++i) - mvcomp->bits[i][((d >> i) & 1)] += incr; + comp_counts->bits[i][((d >> i) & 1)] += incr; } /* Code the fractional pel bits */ if (c == MV_CLASS_0) { - mvcomp->class0_fp[d][f] += incr; + comp_counts->class0_fp[d][f] += incr; } else { - mvcomp->fp[f] += incr; + comp_counts->fp[f] += incr; } /* Code the high precision bit */ if (usehp) { if (c == MV_CLASS_0) { - mvcomp->class0_hp[e] += incr; + comp_counts->class0_hp[e] += incr; } else { - mvcomp->hp[e] += incr; + comp_counts->hp[e] += incr; } } } -#ifdef SMOOTH_MV_COUNTS -static void smooth_counts(nmv_component_counts *mvcomp) { - static const int flen = 3; // (filter_length + 1) / 2 - static const int fval[] = {8, 3, 1}; - static const int fvalbits = 4; - int i; - unsigned int smvcount[MV_VALS]; - vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount)); - smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1; - for (i = flen - 1; i <= MV_VALS - flen; ++i) { - int j, s = smvcount[i] * fval[0]; - for (j = 1; j < flen; ++j) - s += (smvcount[i - j] + smvcount[i + j]) * fval[j]; - mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits; - } -} -#endif - static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { int v; vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); for (v = 1; v <= MV_MAX; v++) { - increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); - increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); + inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); + inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); } } -void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, - int usehp) { +void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); mvctx->joints[j]++; - usehp = usehp && vp9_use_nmv_hp(ref); if (mv_joint_vertical(j)) - increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp); + inc_mv_component_count(mv->row, &mvctx->comps[0], 1); if (mv_joint_horizontal(j)) - increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp); + inc_mv_component_count(mv->col, &mvctx->comps[1], 1); } static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { @@ -230,79 +190,6 @@ void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { counts_to_context(&nmv_count->comps[1], usehp); } -void vp9_counts_to_nmv_context( - nmv_context_counts *nmv_count, - nmv_context *prob, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]) { - int i, j, k; - vp9_counts_process(nmv_count, usehp); - vp9_tree_probs_from_distribution(vp9_mv_joint_tree, - prob->joints, - branch_ct_joint, - nmv_count->joints, 0); - for (i = 0; i < 2; ++i) { - 
const uint32_t s0 = nmv_count->comps[i].sign[0]; - const uint32_t s1 = nmv_count->comps[i].sign[1]; - - prob->comps[i].sign = get_binary_prob(s0, s1); - branch_ct_sign[i][0] = s0; - branch_ct_sign[i][1] = s1; - vp9_tree_probs_from_distribution(vp9_mv_class_tree, - prob->comps[i].classes, - branch_ct_classes[i], - nmv_count->comps[i].classes, 0); - vp9_tree_probs_from_distribution(vp9_mv_class0_tree, - prob->comps[i].class0, - branch_ct_class0[i], - nmv_count->comps[i].class0, 0); - for (j = 0; j < MV_OFFSET_BITS; ++j) { - const uint32_t b0 = nmv_count->comps[i].bits[j][0]; - const uint32_t b1 = nmv_count->comps[i].bits[j][1]; - - prob->comps[i].bits[j] = get_binary_prob(b0, b1); - branch_ct_bits[i][j][0] = b0; - branch_ct_bits[i][j][1] = b1; - } - } - for (i = 0; i < 2; ++i) { - for (k = 0; k < CLASS0_SIZE; ++k) { - vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].class0_fp[k], - branch_ct_class0_fp[i][k], - nmv_count->comps[i].class0_fp[k], 0); - } - vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - prob->comps[i].fp, - branch_ct_fp[i], - nmv_count->comps[i].fp, 0); - } - if (usehp) { - for (i = 0; i < 2; ++i) { - const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0]; - const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1]; - const uint32_t hp0 = nmv_count->comps[i].hp[0]; - const uint32_t hp1 = nmv_count->comps[i].hp[1]; - - prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); - branch_ct_class0_hp[i][0] = c0_hp0; - branch_ct_class0_hp[i][1] = c0_hp1; - - prob->comps[i].hp = get_binary_prob(hp0, hp1); - branch_ct_hp[i][0] = hp0; - branch_ct_hp[i][1] = hp1; - } - } -} - static unsigned int adapt_probs(unsigned int i, vp9_tree tree, vp9_prob this_probs[], @@ -332,110 +219,45 @@ static unsigned int adapt_probs(unsigned int i, } -void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { +void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { int i, j; -#ifdef MV_COUNT_TESTING - printf("joints count: "); - for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]); - printf("\n"); fflush(stdout); - printf("signs count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]); - printf("\n"); fflush(stdout); - printf("classes count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_CLASSES; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].classes[j]); - printf("\n"); fflush(stdout); - } - printf("class0 count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].class0[j]); - printf("\n"); fflush(stdout); - } - printf("bits count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0], - cm->fc.NMVcount.comps[i].bits[j][1]); - printf("\n"); fflush(stdout); - } - printf("class0_fp count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("\n"); fflush(stdout); - } - printf("fp count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < 4; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].fp[j]); - printf("\n"); fflush(stdout); - } - if (usehp) { - printf("class0_hp count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0], - cm->fc.NMVcount.comps[i].class0_hp[1]); - printf("\n"); fflush(stdout); - printf("hp count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", 
cm->fc.NMVcount.comps[i].hp[0], - cm->fc.NMVcount.comps[i].hp[1]); - printf("\n"); fflush(stdout); - } -#endif -#ifdef SMOOTH_MV_COUNTS - smooth_counts(&cm->fc.NMVcount.comps[0]); - smooth_counts(&cm->fc.NMVcount.comps[1]); -#endif - vp9_counts_process(&cm->fc.NMVcount, usehp); - adapt_probs(0, vp9_mv_joint_tree, - cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints, - cm->fc.NMVcount.joints); + FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; + + nmv_context *ctx = &cm->fc.nmvc; + nmv_context *pre_ctx = &pre_fc->nmvc; + nmv_context_counts *cts = &cm->counts.mv; + + vp9_counts_process(cts, usehp); + + adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); for (i = 0; i < 2; ++i) { - adapt_prob(&cm->fc.nmvc.comps[i].sign, - cm->fc.pre_nmvc.comps[i].sign, - cm->fc.NMVcount.comps[i].sign); - adapt_probs(0, vp9_mv_class_tree, - cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes, - cm->fc.NMVcount.comps[i].classes); - adapt_probs(0, vp9_mv_class0_tree, - cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0, - cm->fc.NMVcount.comps[i].class0); - for (j = 0; j < MV_OFFSET_BITS; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].bits[j], - cm->fc.pre_nmvc.comps[i].bits[j], - cm->fc.NMVcount.comps[i].bits[j]); - } + adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign); + adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, + pre_ctx->comps[i].classes, cts->comps[i].classes); + adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, + pre_ctx->comps[i].class0, cts->comps[i].class0); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j], + cts->comps[i].bits[j]); } + for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - adapt_probs(0, vp9_mv_fp_tree, - cm->fc.nmvc.comps[i].class0_fp[j], - cm->fc.pre_nmvc.comps[i].class0_fp[j], - cm->fc.NMVcount.comps[i].class0_fp[j]); - } - adapt_probs(0, vp9_mv_fp_tree, - cm->fc.nmvc.comps[i].fp, - cm->fc.pre_nmvc.comps[i].fp, - cm->fc.NMVcount.comps[i].fp); + for (j = 0; j < CLASS0_SIZE; ++j) + adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], + pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); + + adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, + cts->comps[i].fp); } + if (usehp) { for (i = 0; i < 2; ++i) { - adapt_prob(&cm->fc.nmvc.comps[i].class0_hp, - cm->fc.pre_nmvc.comps[i].class0_hp, - cm->fc.NMVcount.comps[i].class0_hp); - adapt_prob(&cm->fc.nmvc.comps[i].hp, - cm->fc.pre_nmvc.comps[i].hp, - cm->fc.NMVcount.comps[i].hp); + adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp, + cts->comps[i].class0_hp); + adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp); } } } @@ -448,5 +270,5 @@ void vp9_entropy_mv_init() { } void vp9_init_mv_probs(VP9_COMMON *cm) { - vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context)); + cm->fc.nmvc = default_nmv_context; } diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index 15994a6..85a1f3a 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -21,15 +21,11 @@ struct VP9Common; void vp9_entropy_mv_init(); void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp); -int vp9_use_nmv_hp(const MV *ref); +void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +int vp9_use_mv_hp(const MV *ref); #define VP9_NMV_UPDATE_PROB 252 -//#define MV_GROUP_UPDATE - -#define LOW_PRECISION_MV_UPDATE /* Use 7 
bit forward update */ - /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 typedef enum { @@ -99,7 +95,14 @@ typedef struct { nmv_component comps[2]; } nmv_context; -MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv); +static INLINE MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { + if (mv->row == 0) { + return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ; + } else { + return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ; + } +} + MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset); @@ -121,22 +124,8 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, - int usehp); -extern const nmv_context vp9_default_nmv_context; -void vp9_counts_to_nmv_context( - nmv_context_counts *NMVcount, - nmv_context *prob, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]); +void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); + void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index e18d353..86f0d0b 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -14,25 +14,28 @@ #include "./vpx_config.h" #define LOG2_MI_SIZE 3 +#define LOG2_MI_BLOCK_SIZE (6 - LOG2_MI_SIZE) // 64 = 2^6 -#define MI_SIZE (1 << LOG2_MI_SIZE) -#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1) +#define MI_SIZE (1 << LOG2_MI_SIZE) // pixels per mi-unit +#define MI_BLOCK_SIZE (1 << LOG2_MI_BLOCK_SIZE) // mi-units per max block + +#define MI_MASK (MI_BLOCK_SIZE - 1) typedef enum BLOCK_SIZE_TYPE { - BLOCK_SIZE_AB4X4, - BLOCK_SIZE_SB4X8, - BLOCK_SIZE_SB8X4, - BLOCK_SIZE_SB8X8, - BLOCK_SIZE_SB8X16, - BLOCK_SIZE_SB16X8, - BLOCK_SIZE_MB16X16, - BLOCK_SIZE_SB16X32, - BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, - BLOCK_SIZE_SB32X64, - BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X64, - BLOCK_SIZE_TYPES + BLOCK_SIZE_AB4X4, BLOCK_4X4 = BLOCK_SIZE_AB4X4, + BLOCK_SIZE_SB4X8, BLOCK_4X8 = BLOCK_SIZE_SB4X8, + BLOCK_SIZE_SB8X4, BLOCK_8X4 = BLOCK_SIZE_SB8X4, + BLOCK_SIZE_SB8X8, BLOCK_8X8 = BLOCK_SIZE_SB8X8, + BLOCK_SIZE_SB8X16, BLOCK_8X16 = BLOCK_SIZE_SB8X16, + BLOCK_SIZE_SB16X8, BLOCK_16X8 = BLOCK_SIZE_SB16X8, + BLOCK_SIZE_MB16X16, BLOCK_16X16 = BLOCK_SIZE_MB16X16, + BLOCK_SIZE_SB16X32, BLOCK_16X32 = BLOCK_SIZE_SB16X32, + BLOCK_SIZE_SB32X16, BLOCK_32X16 = BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, BLOCK_32X32 = BLOCK_SIZE_SB32X32, + BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64, + BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64, + BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES } BLOCK_SIZE_TYPE; typedef enum PARTITION_TYPE { @@ -40,10 +43,34 @@ typedef enum PARTITION_TYPE { PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, - PARTITION_TYPES + PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES } PARTITION_TYPE; #define PARTITION_PLOFFSET 4 // number of probability models per block size #define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +typedef enum { + TX_4X4 = 0, // 4x4 dct transform + 
TX_8X8 = 1, // 8x8 dct transform + TX_16X16 = 2, // 16x16 dct transform + TX_32X32 = 3, // 32x32 dct transform + TX_SIZE_MAX_SB, // Number of transforms available to SBs +} TX_SIZE; + +typedef enum { + ONLY_4X4 = 0, + ALLOW_8X8 = 1, + ALLOW_16X16 = 2, + ALLOW_32X32 = 3, + TX_MODE_SELECT = 4, + NB_TXFM_MODES = 5, +} TX_MODE; + +typedef enum { + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3 // ADST in both directions +} TX_TYPE; + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index a692271..643b229 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_sadmxn.h" static void lower_mv_precision(int_mv *mv, int usehp) { - if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) { + if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) { if (mv->as_mv.row & 1) mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); if (mv->as_mv.col & 1) diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index d4ae210..b0fa505 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -28,18 +28,6 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int_mv *nearest, int_mv *near); -static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, - int_mv *mvp, const int *ref_frame_sign_bias) { - MV xmv = mvp->as_mv; - - if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { - xmv.row *= -1; - xmv.col *= -1; - } - - mvp->as_mv = xmv; -} - // TODO(jingning): this mv clamping function should be block size dependent. static void clamp_mv(int_mv *mv, int mb_to_left_edge, @@ -61,15 +49,6 @@ static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { return tmp_mv.as_int != mv->as_int; } -static int check_mv_bounds(int_mv *mv, - int mb_to_left_edge, int mb_to_right_edge, - int mb_to_top_edge, int mb_to_bottom_edge) { - return mv->as_mv.col < mb_to_left_edge || - mv->as_mv.col > mb_to_right_edge || - mv->as_mv.row < mb_to_top_edge || - mv->as_mv.row > mb_to_bottom_edge; -} - void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, MACROBLOCKD *xd, int_mv *dst_nearest, @@ -86,13 +65,13 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 1 + b)->as_mode.first); + return ((cur_mb->bmi + 1 + b)->as_mode); } else { return cur_mb->mbmi.mode; } } assert(b == 1 || b == 3); - return (cur_mb->bmi + b - 1)->as_mode.first; + return (cur_mb->bmi + b - 1)->as_mode; } static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, @@ -104,13 +83,13 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - return ((cur_mb->bmi + 2 + b)->as_mode.first); + return ((cur_mb->bmi + 2 + b)->as_mode); } else { return cur_mb->mbmi.mode; } } - return (cur_mb->bmi + b - 2)->as_mode.first; + return (cur_mb->bmi + b - 2)->as_mode; } #endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index dcc7f03..a95560a 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -124,9 +124,7 @@ void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { // 
Rows for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j]; - vp9_idct4_1d(temp_in, outptr); + vp9_idct4_1d(input, outptr); input += 4; outptr += 4; } @@ -158,23 +156,6 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, int pitch, int stride) { - int a1; - int r, c; - int16_t out = dct_const_round_shift(input_dc * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) - dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); - - dst_ptr += stride; - pred_ptr += pitch; - } -} - static void idct8_1d(int16_t *input, int16_t *output) { int16_t step1[8], step2[8]; int temp1, temp2; @@ -428,12 +409,11 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t out[8 * 8]; + int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; int16_t temp_in[8], temp_out[8]; - vpx_memset(out, 0, sizeof(out)); // First transform rows // only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { @@ -535,6 +515,7 @@ static void idct16_1d(int16_t *input, int16_t *output) { step1[14] = -step2[14] + step2[15]; step1[15] = step2[14] + step2[15]; + // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; step2[0] = dct_const_round_shift(temp1); @@ -852,15 +833,13 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t out[16 * 16]; + int16_t out[16 * 16] = { 0 }; int16_t *outptr = out; int i, j; int16_t temp_in[16], temp_out[16]; - /* First transform rows. Since all non-zero dct coefficients are in - * upper-left 4x4 area, we only need to calculate first 4 rows here. - */ - vpx_memset(out, 0, sizeof(out)); + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { idct16_1d(input, outptr); input += 16; @@ -1283,15 +1262,13 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { - int16_t out[32 * 32]; + int16_t out[32 * 32] = { 0 }; int16_t *outptr = out; int i, j; int16_t temp_in[32], temp_out[32]; - /* First transform rows. Since all non-zero dct coefficients are in - * upper-left 4x4 area, we only need to calculate first 4 rows here. - */ - vpx_memset(out, 0, sizeof(out)); + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. for (i = 0; i < 4; ++i) { idct32_1d(input, outptr); input += 32; diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 64f14c9..2d959f0 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -22,10 +22,15 @@ #define DCT_CONST_BITS 14 #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) +#define WHT_UPSCALE_FACTOR 2 + #define pair_set_epi16(a, b) \ _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) -// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31. 
+// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) static const int cospi_1_64 = 16364; static const int cospi_2_64 = 16305; diff --git a/libvpx/vp9/common/vp9_implicit_segmentation.c b/libvpx/vp9/common/vp9_implicit_segmentation.c deleted file mode 100644 index 2a1d35f..0000000 --- a/libvpx/vp9/common/vp9_implicit_segmentation.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_onyxc_int.h" - -#define MAX_REGIONS 24000 -#ifndef NULL -#define NULL 0 -#endif - -#define min_mbs_in_region 3 - -// this linked list structure holds equivalences for connected -// component labeling -struct list_el { - int label; - int seg_value; - int count; - struct list_el *next; -}; -typedef struct list_el item; - -// connected colorsegments -typedef struct { - int min_x; - int min_y; - int max_x; - int max_y; - int64_t sum_x; - int64_t sum_y; - int pixels; - int seg_value; - int label; -} segment_info; - - -typedef enum { - SEGMENT_MODE, - SEGMENT_MV, - SEGMENT_REFFRAME, - SEGMENT_SKIPPED -} SEGMENT_TYPE; - - -// this merges the two equivalence lists and -// then makes sure that every label points to the same -// equivalence list -void merge(item *labels, int u, int v) { - item *a = labels[u].next; - item *b = labels[v].next; - item c; - item *it = &c; - int count; - - // check if they are already merged - if (u == v || a == b) - return; - - count = a->count + b->count; - - // merge 2 sorted linked lists. - while (a != NULL && b != NULL) { - if (a->label < b->label) { - it->next = a; - a = a->next; - } else { - it->next = b; - b = b->next; - } - - it = it->next; - } - - if (a == NULL) - it->next = b; - else - it->next = a; - - it = c.next; - - // make sure every equivalence in the linked list points to this new ll - while (it != NULL) { - labels[it->label].next = c.next; - it = it->next; - } - c.next->count = count; - -} - -void segment_via_mode_info(VP9_COMMON *oci, int how) { - MODE_INFO *mi = oci->mi; - int i, j; - int mb_index = 0; - - int label = 1; - int pitch = oci->mb_cols; - - // holds linked list equivalences - // the max should probably be allocated at a higher level in oci - item equivalences[MAX_REGIONS]; - int eq_ptr = 0; - item labels[MAX_REGIONS]; - segment_info segments[MAX_REGIONS]; - int label_count = 1; - int labeling[400 * 300]; - int *lp = labeling; - - label_count = 1; - memset(labels, 0, sizeof(labels)); - memset(segments, 0, sizeof(segments)); - - /* Go through each macroblock first pass labelling */ - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - // int above seg_value, left seg_value, this seg_value... 
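For reference, the first-pass labelling in the deleted segment_via_mode_info() (the loop in progress here) is standard two-pass connected-component labelling over the macroblock grid: each block reuses the label of a matching above or left neighbour, merging the two label equivalence classes when both match, and otherwise opens a new label. A compact stand-alone sketch of the same idea (an illustration only; it uses a union-find in place of the linked-list merge() above and a toy value map in place of the per-MB mode/mv/reference value):

#include <stdio.h>

#define W 8
#define H 6
#define MAX_LABELS (W * H + 1)

static int parent[MAX_LABELS];

static int find_root(int x) {
  while (parent[x] != x) x = parent[x];
  return x;
}

static void merge_labels(int a, int b) {
  a = find_root(a);
  b = find_root(b);
  if (a != b) parent[b] = a;  // union the two equivalence classes
}

int main(void) {
  // toy "segment value" map standing in for the per-MB mode/mv/ref value
  static const int val[H][W] = {
    {1, 1, 0, 0, 0, 2, 2, 2}, {1, 1, 0, 0, 0, 2, 2, 2},
    {0, 0, 0, 1, 1, 1, 0, 0}, {0, 0, 0, 1, 1, 1, 0, 0},
    {3, 3, 0, 0, 0, 0, 0, 0}, {3, 3, 0, 0, 0, 2, 2, 2},
  };
  int label[H][W];
  int next = 1, r, c;

  for (r = 0; r < MAX_LABELS; ++r) parent[r] = r;

  // first pass: reuse the above/left label when the value matches, else new label
  for (r = 0; r < H; ++r)
    for (c = 0; c < W; ++c) {
      const int a = r ? (val[r - 1][c] == val[r][c] ? label[r - 1][c] : 0) : 0;
      const int l = c ? (val[r][c - 1] == val[r][c] ? label[r][c - 1] : 0) : 0;
      if (a && l) { label[r][c] = a < l ? a : l; merge_labels(a, l); }
      else if (a || l) label[r][c] = a ? a : l;
      else label[r][c] = next++;
    }

  // second pass: collapse labels to their equivalence-class roots and print
  // (the original additionally renumbers surviving regions and drops tiny ones)
  for (r = 0; r < H; ++r) {
    for (c = 0; c < W; ++c) printf("%3d", find_root(label[r][c]));
    printf("\n");
  }
  return 0;
}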
- int a = -1, l = -1, n = -1; - - // above label, left label - int al = -1, ll = -1; - if (i) { - al = lp[j - pitch]; - a = labels[al].next->seg_value; - } - if (j) { - ll = lp[j - 1]; - l = labels[ll].next->seg_value; - } - - // what setting are we going to do the implicit segmentation on - switch (how) { - case SEGMENT_MODE: - n = mi[mb_index].mbmi.mode; - break; - case SEGMENT_MV: - n = mi[mb_index].mbmi.mv[0].as_int; - if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME) - n = -9999999; - break; - case SEGMENT_REFFRAME: - n = mi[mb_index].mbmi.ref_frame[0]; - break; - case SEGMENT_SKIPPED: - n = mi[mb_index].mbmi.mb_skip_coeff; - break; - } - - // above and left both have the same seg_value - if (n == a && n == l) { - // pick the lowest label - lp[j] = (al < ll ? al : ll); - labels[lp[j]].next->count++; - - // merge the above and left equivalencies - merge(labels, al, ll); - } - // this matches above seg_value - else if (n == a) { - // give it the same label as above - lp[j] = al; - labels[al].next->count++; - } - // this matches left seg_value - else if (n == l) { - // give it the same label as above - lp[j] = ll; - labels[ll].next->count++; - } else { - // new label doesn't match either - item *e = &labels[label]; - item *nl = &equivalences[eq_ptr++]; - lp[j] = label; - nl->label = label; - nl->next = 0; - nl->seg_value = n; - nl->count = 1; - e->next = nl; - label++; - } - mb_index++; - } - mb_index++; - } - lp = labeling; - - // give new labels to regions - for (i = 1; i < label; i++) - if (labels[i].next->count > min_mbs_in_region && - labels[labels[i].next->label].label == 0) { - segment_info *cs = &segments[label_count]; - cs->label = label_count; - labels[labels[i].next->label].label = label_count++; - labels[labels[i].next->label].seg_value = labels[i].next->seg_value; - cs->seg_value = labels[labels[i].next->label].seg_value; - cs->min_x = oci->mb_cols; - cs->min_y = oci->mb_rows; - cs->max_x = 0; - cs->max_y = 0; - cs->sum_x = 0; - cs->sum_y = 0; - cs->pixels = 0; - } - - lp = labeling; - - // this is just to gather stats... 
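For reference, the stats pass that follows accumulates, per region, a bounding box, centroid numerators and a pixel count. A minimal stand-alone sketch of that accumulation (it mirrors the segment_info fields min_x/max_x, sum_x/sum_y and pixels; not libvpx code):

#include <stdio.h>

typedef struct {
  int min_x, min_y, max_x, max_y;  // bounding box of the region
  long long sum_x, sum_y;          // centroid numerators
  int pixels;                      // number of points in the region
} region_stats;

static void add_point(region_stats *rs, int x, int y) {
  if (x < rs->min_x) rs->min_x = x;
  if (x > rs->max_x) rs->max_x = x;
  if (y < rs->min_y) rs->min_y = y;
  if (y > rs->max_y) rs->max_y = y;
  rs->sum_x += x;
  rs->sum_y += y;
  rs->pixels++;
}

int main(void) {
  // min_x/min_y start high, like cs->min_x = oci->mb_cols in the original
  region_stats rs = { 1000, 1000, 0, 0, 0, 0, 0 };
  add_point(&rs, 3, 5);
  add_point(&rs, 4, 5);
  add_point(&rs, 4, 6);
  printf("bbox (%d,%d)-(%d,%d), centroid (%lld/%d, %lld/%d)\n",
         rs.min_x, rs.min_y, rs.max_x, rs.max_y,
         rs.sum_x, rs.pixels, rs.sum_y, rs.pixels);
  return 0;
}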
- for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - const int old_lab = labels[lp[j]].next->label; - const int lab = labels[old_lab].label; - segment_info *cs = &segments[lab]; - - cs->min_x = MIN(cs->min_x, j); - cs->max_x = MAX(cs->max_x, j); - cs->min_y = MIN(cs->min_y, i); - cs->max_y = MAX(cs->max_y, i); - cs->sum_x += j; - cs->sum_y += i; - cs->pixels++; - - lp[j] = lab; - mb_index++; - } - mb_index++; - } - - { - lp = labeling; - printf("labelling \n"); - mb_index = 0; - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - printf("%4d", lp[j]); - } - printf(" "); - for (j = 0; j < oci->mb_cols; j++, mb_index++) { - // printf("%3d",mi[mb_index].mbmi.mode ); - printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row, - mi[mb_index].mbmi.mv[0].as_mv.col); - } - printf("\n"); - ++mb_index; - } - printf("\n"); - } -} - diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 7b3f0be..5498b17 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -33,18 +33,13 @@ static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[NEWMV] = 1; } -void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, - int sharpness_lvl) { - int i; - - /* For each possible value for the loop filter fill out limits */ - for (i = 0; i <= MAX_LOOP_FILTER; i++) { - int filt_lvl = i; - int block_inside_limit = 0; +static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) { + int lvl; - /* Set loop filter paramaeters that control sharpness. */ - block_inside_limit = filt_lvl >> (sharpness_lvl > 0); - block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter paramaeters that control sharpness. 
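For reference, the combined shift introduced on the next line folds the two sequential shifts of the removed vp9_loop_filter_update_sharpness() into one expression; the two forms are identical because (x >> a) >> b == x >> (a + b) for non-negative x. A quick stand-alone check of the equivalence (a sketch, with bounds taken from MAX_LOOP_FILTER and MAX_SHARPNESS):

#include <assert.h>

int main(void) {
  int lvl, s;
  for (s = 0; s <= 7; ++s)           // sharpness_lvl range (MAX_SHARPNESS == 7)
    for (lvl = 0; lvl <= 63; ++lvl)  // filter level range (MAX_LOOP_FILTER == 63)
      assert(((lvl >> (s > 0)) >> (s > 4)) ==   // old: two separate shifts
             (lvl >> ((s > 0) + (s > 4))));     // new: one combined shift
  return 0;
}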
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); if (sharpness_lvl > 0) { if (block_inside_limit > (9 - sharpness_lvl)) @@ -54,21 +49,19 @@ void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, if (block_inside_limit < 1) block_inside_limit = 1; - vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); - vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), - SIMD_WIDTH); - vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit), SIMD_WIDTH); } } -void vp9_loop_filter_init(VP9_COMMON *cm) { +void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { loop_filter_info_n *lfi = &cm->lf_info; int i; // init limits for given sharpness - vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); - cm->last_sharpness_level = cm->sharpness_level; + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; // init LUT for lvl and hev thr picking lf_init_lut(lfi); @@ -78,98 +71,68 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); } -void vp9_loop_filter_frame_init(VP9_COMMON *cm, - MACROBLOCKD *xd, +void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, int default_filt_lvl) { - int seg, // segment number - ref, // index in ref_lf_deltas - mode; // index in mode_lf_deltas + int seg; // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 - int n_shift = default_filt_lvl >> 5; - - loop_filter_info_n *lfi = &cm->lf_info; - - /* update limits if sharpness has changed */ - // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl); - // printf("sharpness level: %d [%d]\n", - // cm->sharpness_level, cm->last_sharpness_level); - if (cm->last_sharpness_level != cm->sharpness_level) { - vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); - cm->last_sharpness_level = cm->sharpness_level; + const int n_shift = default_filt_lvl >> 5; + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *lf = &xd->lf; + + // update limits if sharpness has changed + if (lf->last_sharpness_level != lf->sharpness_level) { + update_sharpness(lfi, lf->sharpness_level); + lf->last_sharpness_level = lf->sharpness_level; } - for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) { - int lvl_seg = default_filt_lvl; - int lvl_ref, lvl_mode; - + for (seg = 0; seg < MAX_SEGMENTS; seg++) { + int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment - if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) { - /* Abs value */ - if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { - lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - } else { /* Delta Value */ - lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - lvl_seg = clamp(lvl_seg, 0, 63); - } + if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) { + const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF); + lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA + ? 
data + : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); } - if (!xd->mode_ref_lf_delta_enabled) { - /* we could get rid of this if we assume that deltas are set to - * zero when not in use; encoder always uses deltas - */ + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); continue; } - lvl_ref = lvl_seg; + intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift); + lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); - /* INTRA_FRAME */ - ref = INTRA_FRAME; - - /* Apply delta for reference frame */ - lvl_ref += xd->ref_lf_deltas[ref] << n_shift; - - mode = 0; /* all the rest of Intra modes */ - lvl_mode = lvl_ref; - lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63); - - /* LAST, GOLDEN, ALT */ - for (ref = 1; ref < MAX_REF_FRAMES; ref++) { - int lvl_ref = lvl_seg; - - /* Apply delta for reference frame */ - lvl_ref += xd->ref_lf_deltas[ref] << n_shift; - - /* Apply delta for Inter modes */ - for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) { - lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift); - lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63); + for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift) + + (lf->mode_deltas[mode] << n_shift); + lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); } - } } } -static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi, - struct loop_filter_info *lfi) { - const loop_filter_info_n *lfi_n = &cm->lf_info; - int mode = mbmi->mode; - int mode_index = lfi_n->mode_lf_lut[mode]; - int seg = mbmi->segment_id; - int ref_frame = mbmi->ref_frame[0]; - int filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; - - if (filter_level) { - const int hev_index = filter_level >> 4; +static int build_lfi(const loop_filter_info_n *const lfi_n, + const MB_MODE_INFO *const mbmi, + struct loop_filter_info *const lfi) { + const int seg = mbmi->segment_id; + const int ref = mbmi->ref_frame[0]; + const int mode = lfi_n->mode_lf_lut[mbmi->mode]; + const int filter_level = lfi_n->lvl[seg][ref][mode]; + + if (filter_level > 0) { lfi->mblim = lfi_n->mblim[filter_level]; - lfi->blim = lfi_n->blim[filter_level]; lfi->lim = lfi_n->lim[filter_level]; - lfi->hev_thr = lfi_n->hev_thr[hev_index]; + lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4]; return 1; + } else { + return 0; } - return 0; } static void filter_selectively_vert(uint8_t *s, int pitch, @@ -180,7 +143,8 @@ static void filter_selectively_vert(uint8_t *s, int pitch, const struct loop_filter_info *lfi) { unsigned int mask; - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) { + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { if (mask & 1) { if (mask_16x16 & 1) { vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, @@ -198,14 +162,11 @@ static void filter_selectively_vert(uint8_t *s, int pitch, lfi->hev_thr, 1); assert(!(mask_16x16 & 1)); assert(!(mask_8x8 & 1)); - } else { - assert(0); } - - if (mask_4x4_int & 1) - vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); } + if (mask_4x4_int & 1) + vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); s += 8; lfi++; mask_16x16 >>= 1; @@ -223,13 +184,22 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, int only_4x4_1, 
const struct loop_filter_info *lfi) { unsigned int mask; + int count; - for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) { + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= count) { + count = 1; if (mask & 1) { if (!only_4x4_1) { if (mask_16x16 & 1) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); + if ((mask_16x16 & 3) == 3) { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + count = 2; + } else { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } assert(!(mask_8x8 & 1)); assert(!(mask_4x4 & 1)); assert(!(mask_4x4_int & 1)); @@ -243,8 +213,6 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, lfi->hev_thr, 1); assert(!(mask_16x16 & 1)); assert(!(mask_8x8 & 1)); - } else { - assert(0); } } @@ -252,40 +220,41 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } - s += 8; - lfi++; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; + s += 8 * count; + lfi += count; + mask_16x16 >>= count; + mask_8x8 >>= count; + mask_4x4 >>= count; + mask_4x4_int >>= count; } } -static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, - int plane, int mi_row, int mi_col) { - const int ss_x = xd->plane[plane].subsampling_x; - const int ss_y = xd->plane[plane].subsampling_y; - const int row_step = 1 << xd->plane[plane].subsampling_y; - const int col_step = 1 << xd->plane[plane].subsampling_x; - struct buf_2d * const dst = &xd->plane[plane].dst; +static void filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + const MODE_INFO *mi, + int mi_row, int mi_col) { + const int ss_x = plane->subsampling_x; + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_x; + const int col_step = 1 << ss_y; + const int row_step_stride = cm->mode_info_stride * row_step; + struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; - MODE_INFO* const mi0 = xd->mode_info_context; - unsigned int mask_16x16[64 / MI_SIZE] = {0}; - unsigned int mask_8x8[64 / MI_SIZE] = {0}; - unsigned int mask_4x4[64 / MI_SIZE] = {0}; - unsigned int mask_4x4_int[64 / MI_SIZE] = {0}; - struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE]; + unsigned int mask_16x16[MI_BLOCK_SIZE] = {0}; + unsigned int mask_8x8[MI_BLOCK_SIZE] = {0}; + unsigned int mask_4x4[MI_BLOCK_SIZE] = {0}; + unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0}; + struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE]; int r, c; - for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; unsigned int mask_8x8_c = 0; unsigned int mask_4x4_c = 0; unsigned int border_mask; // Determine the vertical edges that need filtering - for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) { - const MODE_INFO * const mi = xd->mode_info_context; + for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const int skip_this = mi[c].mbmi.mb_skip_coeff && mi[c].mbmi.ref_frame[0] != INTRA_FRAME; // left edge of current unit is block/partition edge -> no skip @@ -296,14 +265,14 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ? 
!(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; - const TX_SIZE tx_size = plane ? get_uv_tx_size(&mi[c].mbmi) - : mi[c].mbmi.txfm_size; + const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) + ? get_uv_tx_size(&mi[c].mbmi) + : mi[c].mbmi.txfm_size; const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI - if (!build_lfi(cm, &mi[c].mbmi, - lfi[r] + (c >> xd->plane[plane].subsampling_x))) + if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x))) continue; // Build masks based on the transform size of each block @@ -362,13 +331,12 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, mask_4x4_c & border_mask, mask_4x4_int[r], lfi[r]); dst->buf += 8 * dst->stride; - xd->mode_info_context += cm->mode_info_stride * row_step; + mi += row_step_stride; } // Now do horizontal pass dst->buf = dst0; - xd->mode_info_context = mi0; - for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; @@ -378,30 +346,33 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, mask_4x4[r], mask_4x4_int_r, mi_row + r == 0, lfi[r]); dst->buf += 8 * dst->stride; - xd->mode_info_context += cm->mode_info_stride * row_step; } } -void vp9_loop_filter_frame(VP9_COMMON *cm, - MACROBLOCKD *xd, - int frame_filter_level, - int y_only) { +void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, + VP9_COMMON *cm, MACROBLOCKD *xd, + int start, int stop, int y_only) { + const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; - // Initialize the loop filter for this frame. - vp9_loop_filter_frame_init(cm, xd, frame_filter_level); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) { + for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; - setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col); - for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) { - xd->mode_info_context = mi + mi_col; - filter_block_plane(cm, xd, plane, mi_row, mi_col); + setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + for (plane = 0; plane < num_planes; ++plane) { + filter_block_plane(cm, &xd->plane[plane], mi + mi_col, mi_row, mi_col); } } } } + +void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, + int frame_filter_level, int y_only) { + if (!frame_filter_level) return; + vp9_loop_filter_frame_init(cm, xd, frame_filter_level); + vp9_loop_filter_rows(cm->frame_to_show, cm, xd, + 0, cm->mi_rows, y_only); +} diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index ce954c0..e59cc64 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -13,61 +13,46 @@ #include "vpx_ports/mem.h" #include "vpx_config.h" + #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_seg_common.h" #define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + #define SIMD_WIDTH 16 -/* Need to align this structure so when it is declared and - * passed it can be loaded into vector registers. 
- */ +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. typedef struct { - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, - blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); - DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[4][SIMD_WIDTH]); - unsigned char lvl[MAX_MB_SEGMENTS][4][4]; - unsigned char mode_lf_lut[MB_MODE_COUNT]; + uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; + uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; struct loop_filter_info { - const unsigned char *mblim; - const unsigned char *blim; - const unsigned char *lim; - const unsigned char *hev_thr; + const uint8_t *mblim; + const uint8_t *lim; + const uint8_t *hev_thr; }; -#define prototype_loopfilter(sym) \ - void sym(uint8_t *src, int pitch, const unsigned char *blimit, \ - const unsigned char *limit, const unsigned char *thresh, int count) - -#define prototype_loopfilter_block(sym) \ - void sym(uint8_t *y, uint8_t *u, uint8_t *v, \ - int ystride, int uv_stride, struct loop_filter_info *lfi) - -#if ARCH_X86 || ARCH_X86_64 -#include "x86/vp9_loopfilter_x86.h" -#endif - -typedef void loop_filter_uvfunction(uint8_t *u, /* source pointer */ - int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - uint8_t *v); /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; -void vp9_loop_filter_init(struct VP9Common *cm); +void vp9_loop_filter_init(struct VP9Common *cm, struct loopfilter *lf); -void vp9_loop_filter_frame_init(struct VP9Common *cm, - struct macroblockd *mbd, +// Update the loop filter for the current frame. +// This should be called before vp9_loop_filter_rows(), vp9_loop_filter_frame() +// calls this function directly. +void vp9_loop_filter_frame_init(struct VP9Common *const cm, + struct macroblockd *const xd, int default_filt_lvl); void vp9_loop_filter_frame(struct VP9Common *cm, @@ -75,11 +60,8 @@ void vp9_loop_filter_frame(struct VP9Common *cm, int filter_level, int y_only); -void vp9_loop_filter_partial_frame(struct VP9Common *cm, - struct macroblockd *mbd, - int default_filt_lvl); - -void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, - int sharpness_lvl); - +// Apply the loop filter to [start, stop) macro block rows in frame_buffer. 
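For reference, the declaration that follows exposes the filter over an explicit [start, stop) row range, and vp9_loop_filter_frame() (see the .c change above) now reduces to the init call plus one pass over [0, cm->mi_rows). As an illustration of how a caller might use the range form, for example to filter a frame in two bands, a hedged sketch (the helper below and the band split are assumptions for illustration, not code from this change; it assumes the usual VP9_COMMON / MACROBLOCKD definitions are in scope):

// Sketch: split frame filtering into two row bands with the [start, stop) API.
static void loop_filter_in_two_bands(VP9_COMMON *cm, MACROBLOCKD *xd,
                                     int filter_level, int y_only) {
  // align the split point to MI_BLOCK_SIZE, as the row loop steps by that amount
  const int mid = (cm->mi_rows / 2 / MI_BLOCK_SIZE) * MI_BLOCK_SIZE;
  if (!filter_level)
    return;
  vp9_loop_filter_frame_init(cm, xd, filter_level);
  vp9_loop_filter_rows(cm->frame_to_show, cm, xd, 0, mid, y_only);
  vp9_loop_filter_rows(cm->frame_to_show, cm, xd, mid, cm->mi_rows, y_only);
}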
+void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, + struct VP9Common *cm, struct macroblockd *xd, + int start, int stop, int y_only); #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c index 0efbcaf..88130d8 100644 --- a/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -34,17 +34,44 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, return ~mask; } +static INLINE int8_t flat_mask4(uint8_t thresh, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask5(uint8_t thresh, + uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, + uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, + uint8_t q3, uint8_t q4) { + int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} + // is there high edge variance internal edge: 11111111 yes, 00000000 no -static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1) { +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { int8_t hev = 0; hev |= (abs(p1 - p0) > thresh) * -1; hev |= (abs(q1 - q0) > thresh) * -1; return hev; } -static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { +static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; const int8_t ps1 = (int8_t) *op1 ^ 0x80; @@ -68,7 +95,7 @@ static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1, *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; // outer tap adjustments - filter = ((filter1 + 1) >> 1) & ~hev; + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; @@ -88,8 +115,8 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } } @@ -108,57 +135,30 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - filter(mask, hev, s - 2, s - 1, s, s + 1); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + filter4(mask, hev, s - 2, s - 1, s, s + 1); s += pitch; } } -static INLINE int8_t flatmask4(uint8_t thresh, - uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, - uint8_t q2, uint8_t q3) { - int8_t flat = 0; - flat |= (abs(p1 - p0) > thresh) * -1; - flat |= (abs(q1 - q0) > thresh) * -1; - flat |= (abs(p0 - p2) > thresh) * -1; - flat |= (abs(q0 - q2) > 
thresh) * -1; - flat |= (abs(p3 - p0) > thresh) * -1; - flat |= (abs(q3 - q0) > thresh) * -1; - return ~flat; -} -static INLINE signed char flatmask5(uint8_t thresh, - uint8_t p4, uint8_t p3, uint8_t p2, - uint8_t p1, uint8_t p0, - uint8_t q0, uint8_t q1, uint8_t q2, - uint8_t q3, uint8_t q4) { - int8_t flat = 0; - flat |= (abs(p4 - p0) > thresh) * -1; - flat |= (abs(q4 - q0) > thresh) * -1; - flat = ~flat; - return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); -} - - -static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat, - uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3) { - // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line +static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { if (flat && mask) { const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; - *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3); - *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3); - *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3); - *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3); - *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3); - *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3); + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter(mask, hev, op1, op0, oq0, oq1); + filter4(mask, hev, op1, op0, oq0, oq1); } } @@ -177,11 +177,10 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - mbfilter(mask, hev, flat, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } @@ -198,23 +197,24 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); s += pitch; } } -static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, - uint8_t flat, uint8_t flat2, - uint8_t *op7, uint8_t *op6, uint8_t *op5, - uint8_t *op4, uint8_t *op3, uint8_t *op2, - uint8_t *op1, uint8_t *op0, 
uint8_t *oq0, - uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, - uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, - uint8_t *oq7) { - // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line +static INLINE void filter16(int8_t mask, uint8_t hev, + uint8_t flat, uint8_t flat2, + uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, + uint8_t *oq6, uint8_t *oq7) { if (flat2 && flat && mask) { const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; @@ -222,6 +222,7 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + @@ -251,35 +252,35 @@ static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { - mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { + const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < 8 * count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flatmask5(1, + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - wide_mbfilter(mask, hev, flat, flat2, - s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, - s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); - + filter16(mask, hev, flat, flat2, + s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); ++s; } } @@ -295,14 +296,14 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); - const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7]); - - wide_mbfilter(mask, hev, flat, flat2, - s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask5(1, s[-8], 
s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7]); + + filter16(mask, hev, flat, flat2, + s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); s += p; } } diff --git a/libvpx/vp9/common/vp9_maskingmv.c b/libvpx/vp9/common/vp9_maskingmv.c deleted file mode 100644 index 326201b..0000000 --- a/libvpx/vp9/common/vp9_maskingmv.c +++ /dev/null @@ -1,803 +0,0 @@ -/* - ============================================================================ - Name : vp9_maskingmv.c - Author : jimbankoski - Version : - Copyright : Your copyright notice - Description : Hello World in C, Ansi-style - ============================================================================ - */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -unsigned int vp9_sad16x16_sse3( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - int max_err); - -int vp8_growmaskmb_sse3( - unsigned char *om, - unsigned char *nm); - -void vp8_makemask_sse3( - unsigned char *y, - unsigned char *u, - unsigned char *v, - unsigned char *ym, - int yp, - int uvp, - int ys, - int us, - int vs, - int yt, - int ut, - int vt); - -unsigned int vp9_sad16x16_unmasked_wmt( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - unsigned char *mask); - -unsigned int vp9_sad16x16_masked_wmt( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - unsigned char *mask); - -unsigned int vp8_masked_predictor_wmt( - unsigned char *masked, - unsigned char *unmasked, - int src_stride, - unsigned char *dst_ptr, - int dst_stride, - unsigned char *mask); -unsigned int vp8_masked_predictor_uv_wmt( - unsigned char *masked, - unsigned char *unmasked, - int src_stride, - unsigned char *dst_ptr, - int dst_stride, - unsigned char *mask); -unsigned int vp8_uv_from_y_mask( - unsigned char *ymask, - unsigned char *uvmask); -int yp = 16; -unsigned char sxy[] = { - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 
90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90 -}; - -unsigned char sts[] = { - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -}; -unsigned char str[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; - -unsigned char y[] = { - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40 -}; -int uvp = 8; -unsigned char u[] = { - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 84, 70, 70, 90, 90, 90, 17, 17, - 84, 70, 70, 90, 90, 90, 17, 17, - 80, 70, 70, 90, 90, 90, 17, 17, - 90, 80, 70, 70, 90, 90, 
90, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17 -}; - -unsigned char v[] = { - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80 -}; - -unsigned char ym[256]; -unsigned char uvm[64]; -typedef struct { - unsigned char y; - unsigned char yt; - unsigned char u; - unsigned char ut; - unsigned char v; - unsigned char vt; - unsigned char use; -} COLOR_SEG_ELEMENT; - -/* -COLOR_SEG_ELEMENT segmentation[]= -{ - { 60,4,80,17,80,10, 1}, - { 40,4,15,10,80,10, 1}, -}; -*/ - -COLOR_SEG_ELEMENT segmentation[] = { - { 79, 44, 92, 44, 237, 60, 1}, -}; - -unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v, - COLOR_SEG_ELEMENT sgm[], - int c) { - COLOR_SEG_ELEMENT *s = sgm; - unsigned char m = 0; - int i; - for (i = 0; i < c; i++, s++) - m |= (abs(y - s->y) < s->yt && - abs(u - s->u) < s->ut && - abs(v - s->v) < s->vt ? 255 : 0); - - return m; -} -int neighbors[256][8]; -int makeneighbors(void) { - int i, j; - for (i = 0; i < 256; i++) { - int r = (i >> 4), c = (i & 15); - int ni = 0; - for (j = 0; j < 8; j++) - neighbors[i][j] = i; - for (j = 0; j < 256; j++) { - int nr = (j >> 4), nc = (j & 15); - if (abs(nr - r) < 2 && abs(nc - c) < 2) - neighbors[i][ni++] = j; - } - } - return 0; -} -void grow_ymask(unsigned char *ym) { - unsigned char nym[256]; - int i, j; - - for (i = 0; i < 256; i++) { - nym[i] = ym[i]; - for (j = 0; j < 8; j++) { - nym[i] |= ym[neighbors[i][j]]; - } - } - for (i = 0; i < 256; i++) - ym[i] = nym[i]; -} - -void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v, - unsigned char *ym, unsigned char *uvm, - int yp, int uvp, - COLOR_SEG_ELEMENT sgm[], - int count) { - int r, c; - unsigned char *oym = ym; - - memset(ym, 20, 256); - for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32) - for (c = 0; c < 8; c++) { - int y1 = y[c << 1]; - int u1 = u[c]; - int v1 = v[c]; - int m = pixel_mask(y1, u1, v1, sgm, count); - uvm[c] = m; - ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count); - ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count); - ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count); - ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count); - } - grow_ymask(oym); -} - -int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) - for (j = 0; j < 16; j++) - if (ym[j]) - sad += abs(src[j] - dst[j]); - - return sad; -} - -int compare_masks(unsigned char *sym, unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, sym += 16, ym += 16) - for (j = 0; j < 16; j++) - sad += (sym[j] != ym[j] ? 
1 : 0); - - return sad; -} - -int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) - for (j = 0; j < 16; j++) - if (!ym[j]) - sad += abs(src[j] - dst[j]); - - return sad; -} - -int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, - int yp, int uvp, - unsigned char *dy, unsigned char *du, unsigned char *dv, - int dyp, int duvp, - COLOR_SEG_ELEMENT sgm[], - int count, - int *mi, - int *mj, - int *ui, - int *uj, - int *wm) { - int i, j; - - unsigned char ym[256]; - unsigned char uvm[64]; - unsigned char dym[256]; - unsigned char duvm[64]; - unsigned int e = 0; - int beste = 256; - int bmi = -32, bmj = -32; - int bui = -32, buj = -32; - int beste1 = 256; - int bmi1 = -32, bmj1 = -32; - int bui1 = -32, buj1 = -32; - int obeste; - - // first try finding best mask and then unmasked - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); - - e = unmasked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - // bui=0;buj=0; - // best mv masked destination - make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, - dym, duvm, dyp, duvp, sgm, count); - - obeste = beste; - beste = 0xffffffff; - - // find best masked - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = masked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - beste1 = beste + obeste; - bmi1 = bmi; - bmj1 = bmj; - bui1 = bui; - buj1 = buj; - - beste = 0xffffffff; - // source mask - make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count); - - // find best mask - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); - - e = compare_masks(ym, dym); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - - - // best mv masked destination - make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, - dym, duvm, dyp, duvp, sgm, count); - - obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym); - - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = unmasked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - beste += obeste; - - - if (beste < beste1) { - *mi = bmi; - *mj = bmj; - *ui = bui; - *uj = buj; - *wm = 1; - } else { - *mi = bmi1; - *mj = bmj1; - *ui = bui1; - *uj = buj1; - *wm = 0; - - } - return 0; -} - -int predict(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym, unsigned char *prd) { - int i, j; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16) - for (j = 0; j < 16; j++) - prd[j] = (ym[j] ? 
src[j] : dst[j]); - return 0; -} - -int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, - int yp, int uvp, - unsigned char *dy, unsigned char *du, unsigned char *dv, - int dyp, int duvp, - COLOR_SEG_ELEMENT sgm[], - int count, - int *mi, - int *mj, - int *ui, - int *uj, - int *wm) { - int i, j; - - unsigned char ym[256]; - unsigned char ym2[256]; - unsigned char uvm[64]; - unsigned char dym2[256]; - unsigned char dym[256]; - unsigned char duvm[64]; - unsigned int e = 0; - int beste = 256; - int bmi = -32, bmj = -32; - int bui = -32, buj = -32; - int beste1 = 256; - int bmi1 = -32, bmj1 = -32; - int bui1 = -32, buj1 = -32; - int obeste; - - // first try finding best mask and then unmasked - beste = 0xffffffff; - -#if 0 - for (i = 0; i < 16; i++) { - unsigned char *dy = i * yp + y; - for (j = 0; j < 16; j++) - printf("%2x", dy[j]); - printf("\n"); - } - printf("\n"); - - for (i = -32; i < 48; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 48; j++) - printf("%2x", dyz[j]); - printf("\n"); - } -#endif - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - // bui=0;buj=0; - // best mv masked destination - - vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, - dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - obeste = beste; - beste = 0xffffffff; - - // find best masked - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2); - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - beste1 = beste + obeste; - bmi1 = bmi; - bmj1 = bmj; - bui1 = bui; - buj1 = buj; - - // source mask - vp8_makemask_sse3(y, u, v, - ym, yp, uvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(ym, ym2); - - // find best mask - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - e = compare_masks(ym2, dym2); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - - vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, - dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2); - - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - beste += obeste; - - if (beste < 
beste1) { - *mi = bmi; - *mj = bmj; - *ui = bui; - *uj = buj; - *wm = 1; - } else { - *mi = bmi1; - *mj = bmj1; - *ui = bui1; - *uj = buj1; - *wm = 0; - beste = beste1; - - } - return beste; -} - -int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, - int ymp, int uvmp, - unsigned char *yp, unsigned char *up, unsigned char *vp, - int ypp, int uvpp, - COLOR_SEG_ELEMENT sgm[], - int count, - int mi, - int mj, - int ui, - int uj, - int wm) { - int i, j; - unsigned char dym[256]; - unsigned char dym2[256]; - unsigned char duvm[64]; - unsigned char *yu = ym, *uu = um, *vu = vm; - - unsigned char *dym3 = dym2; - - ym += mi * ymp + mj; - um += mi / 2 * uvmp + mj / 2; - vm += mi / 2 * uvmp + mj / 2; - - yu += ui * ymp + uj; - uu += ui / 2 * uvmp + uj / 2; - vu += ui / 2 * uvmp + uj / 2; - - // best mv masked destination - if (wm) - vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - else - vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3); - vp8_uv_from_y_mask(dym3, duvm); - vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm); - vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm); - - return 0; -} - -unsigned char f0p[1280 * 720 * 3 / 2]; -unsigned char f1p[1280 * 720 * 3 / 2]; -unsigned char prd[1280 * 720 * 3 / 2]; -unsigned char msk[1280 * 720 * 3 / 2]; - - -int mainz(int argc, char *argv[]) { - - FILE *f = fopen(argv[1], "rb"); - FILE *g = fopen(argv[2], "wb"); - int w = atoi(argv[3]), h = atoi(argv[4]); - int y_stride = w, uv_stride = w / 2; - int r, c; - unsigned char *f0 = f0p, *f1 = f1p, *t; - unsigned char ym[256], uvm[64]; - unsigned char ym2[256], uvm2[64]; - unsigned char ym3[256], uvm3[64]; - int a, b; - - COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best; -#if 0 - makeneighbors(); - COLOR_SEG_ELEMENT segmentation[] = { - { 60, 4, 80, 17, 80, 10, 1}, - { 40, 4, 15, 10, 80, 10, 1}, - }; - make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1); - - vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8, - (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v, - segmentation[0].yt, segmentation[0].ut, segmentation[0].vt); - - vp8_growmaskmb_sse3(ym, ym3); - - a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3); - b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3); - - vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3); - - vp8_uv_from_y_mask(ym3, uvm3); - - return 4; -#endif - makeneighbors(); - - - memset(prd, 128, w * h * 3 / 2); - - fread(f0, w * h * 3 / 2, 1, f); - - while (!feof(f)) { - unsigned char *ys = f1, *yd = f0, *yp = prd; - unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h; - unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4; - fread(f1, w * h * 3 / 2, 1, f); - - ys += 32 * y_stride; - yd += 32 * y_stride; - yp += 32 * y_stride; - us += 16 * uv_stride; - ud += 16 * uv_stride; - up += 16 * uv_stride; - vs += 16 * uv_stride; - vd += 16 * uv_stride; - vp += 16 * uv_stride; - for (r = 32; r < h - 32; r += 16, - ys += 16 * w, yd += 16 * w, yp += 16 * w, - us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride, - vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) { - for (c = 32; c < w - 32; c += 16) { - int mi, mj, ui, uj, wm; - int bmi, bmj, bui, buj, bwm; - unsigned char ym[256]; - - if (vp9_sad16x16_sse3(ys + c, y_stride, 
yd + c, y_stride, 0xffff) == 0) - bmi = bmj = bui = buj = bwm = 0; - else { - COLOR_SEG_ELEMENT cs[5]; - int j; - unsigned int beste = 0xfffffff; - unsigned int bestj = 0; - - // try color from last mb segmentation - cs[0] = last; - - // try color segs from 4 pixels in mb recon as segmentation - cs[1].y = yd[c + y_stride + 1]; - cs[1].u = ud[c / 2 + uv_stride]; - cs[1].v = vd[c / 2 + uv_stride]; - cs[1].yt = cs[1].ut = cs[1].vt = 20; - cs[2].y = yd[c + w + 14]; - cs[2].u = ud[c / 2 + uv_stride + 7]; - cs[2].v = vd[c / 2 + uv_stride + 7]; - cs[2].yt = cs[2].ut = cs[2].vt = 20; - cs[3].y = yd[c + w * 14 + 1]; - cs[3].u = ud[c / 2 + uv_stride * 7]; - cs[3].v = vd[c / 2 + uv_stride * 7]; - cs[3].yt = cs[3].ut = cs[3].vt = 20; - cs[4].y = yd[c + w * 14 + 14]; - cs[4].u = ud[c / 2 + uv_stride * 7 + 7]; - cs[4].v = vd[c / 2 + uv_stride * 7 + 7]; - cs[4].yt = cs[4].ut = cs[4].vt = 20; - - for (j = 0; j < 5; j++) { - int e; - - e = fast_masked_motion_search( - ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride, - yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride, - &cs[j], 1, &mi, &mj, &ui, &uj, &wm); - - if (e < beste) { - bmi = mi; - bmj = mj; - bui = ui; - buj = uj, bwm = wm; - bestj = j; - beste = e; - } - } - best = cs[bestj]; - // best = segmentation[0]; - last = best; - } - predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride, - yp + c, up + c / 2, vp + c / 2, w, uv_stride, - &best, 1, bmi, bmj, bui, buj, bwm); - - } - } - fwrite(prd, w * h * 3 / 2, 1, g); - t = f0; - f0 = f1; - f1 = t; - - } - fclose(f); - fclose(g); - return 0; -} diff --git a/libvpx/vp9/common/vp9_mbpitch.c b/libvpx/vp9/common/vp9_mbpitch.c deleted file mode 100644 index 3cf37ff..0000000 --- a/libvpx/vp9/common/vp9_mbpitch.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/common/vp9_blockd.h" - -void vp9_setup_block_dptrs(MACROBLOCKD *mb, - int subsampling_x, int subsampling_y) { - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) { - mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; - mb->plane[i].subsampling_x = i ? subsampling_x : 0; - mb->plane[i].subsampling_y = i ? subsampling_y : 0; - } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - mb->plane[3].subsampling_x = 0; - mb->plane[3].subsampling_y = 0; -#endif -} diff --git a/libvpx/vp9/common/vp9_modecont.c b/libvpx/vp9/common/vp9_modecont.c deleted file mode 100644 index 5d92cfa..0000000 --- a/libvpx/vp9/common/vp9_modecont.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vp9/common/vp9_modecont.h" - -const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS] - [VP9_INTER_MODES - 1] = { - {2, 173, 34}, // 0 = both zero mv - {7, 145, 85}, // 1 = one zero mv + one a predicted mv - {7, 166, 63}, // 2 = two predicted mvs - {7, 94, 66}, // 3 = one predicted/zero and one new mv - {8, 64, 46}, // 4 = two new mvs - {17, 81, 31}, // 5 = one intra neighbour + x - {25, 29, 30}, // 6 = two intra neighbours -}; diff --git a/libvpx/vp9/common/vp9_modecont.h b/libvpx/vp9/common/vp9_modecont.h deleted file mode 100644 index 3ec6079..0000000 --- a/libvpx/vp9/common/vp9_modecont.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_MODECONT_H_ -#define VP9_COMMON_VP9_MODECONT_H_ - -#include "vp9/common/vp9_entropy.h" - -extern const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS] - [VP9_INTER_MODES - 1]; - -#endif // VP9_COMMON_VP9_MODECONT_H_ diff --git a/libvpx/vp9/common/vp9_modecontext.c b/libvpx/vp9/common/vp9_modecontext.c deleted file mode 100644 index a79ab2a..0000000 --- a/libvpx/vp9/common/vp9_modecontext.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vp9/common/vp9_entropymode.h" - -const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1] = { - { /* above = dc */ - { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, - { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, - { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, - { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, - { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, - { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, - { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, - { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, - { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, - { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ - }, { /* above = v */ - { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, - { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, - { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, - { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, - { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* left = d135 */, - { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, - { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, - { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, - { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, - { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ - }, { /* above = h */ - { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, - { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, - { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, - { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, - { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, - { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, - { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, - { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, - { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, - { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ - }, { /* above = d45 */ - { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, - { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, - { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, - { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, - { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, - { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, - { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, - { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, - { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, - { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ - }, { /* above = d135 */ - { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, - { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, - { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, - { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, - { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, - { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* left = d117 */, - { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, - { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, - { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, - { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ - }, { /* above = d117 */ - { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, - { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, - { 36, 23, 27, 165, 
149, 166, 54, 64, 118 } /* left = h */, - { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, - { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, - { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, - { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, - { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, - { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, - { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ - }, { /* above = d153 */ - { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, - { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, - { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, - { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, - { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, - { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, - { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, - { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, - { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, - { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ - }, { /* above = d27 */ - { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, - { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, - { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, - { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, - { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, - { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, - { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* left = d153 */, - { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, - { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, - { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ - }, { /* above = d63 */ - { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, - { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, - { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, - { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, - { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, - { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, - { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, - { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, - { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, - { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ - }, { /* above = tm */ - { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */, - { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, - { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, - { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, - { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, - { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, - { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, - { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, - { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, - { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ - } -}; diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h index a1eef46..a095258 100644 --- a/libvpx/vp9/common/vp9_mv.h +++ b/libvpx/vp9/common/vp9_mv.h @@ -23,14 +23,9 @@ typedef union int_mv { MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ -struct mv32 { +typedef struct { int32_t row; int32_t col; -}; - -typedef union int_mv32 { - uint64_t as_int; - struct mv32 as_mv; -} int_mv32; /* facilitates faster equality tests and copies */ +} MV32; #endif // 
VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index 78fb2f0..ae009b0 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -11,7 +11,7 @@ #include "vp9/common/vp9_mvref_common.h" #define MVREF_NEIGHBOURS 8 -static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { +static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { // SB4X4 {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, // SB4X8 @@ -147,10 +147,9 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, int_mv c2_refmv; MV_REFERENCE_FRAME c_ref_frame; MV_REFERENCE_FRAME c2_ref_frame; - int candidate_scores[MAX_MV_REF_CANDIDATES]; + int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 }; int refmv_count = 0; - int split_count = 0; - int (*mv_ref_search)[2]; + const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type]; const int mi_col = get_mi_col(xd); const int mi_row = get_mi_row(xd); int intra_count = 0; @@ -160,9 +159,7 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, // Blank the reference vector lists and other local structures. vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); - vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); - mv_ref_search = mv_ref_blocks[mbmi->sb_type]; if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { x_idx = block_idx & 1; y_idx = block_idx >> 1; @@ -193,8 +190,6 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, add_candidate_mv(mv_ref_list, candidate_scores, &refmv_count, c_refmv, 16); } - split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 && - candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME); // Count number of neihgbours coded intra and zeromv intra_count += (candidate_mi->mbmi.mode < NEARESTMV); diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h index b85b889..152046f 100644 --- a/libvpx/vp9/common/vp9_onyx.h +++ b/libvpx/vp9/common/vp9_onyx.h @@ -22,7 +22,7 @@ extern "C" #include "vpx_scale/yv12config.h" #include "vp9/common/vp9_ppflags.h" -#define MAX_MB_SEGMENTS 8 +#define MAX_SEGMENTS 8 typedef int *VP9_PTR; @@ -64,41 +64,13 @@ extern "C" FRAMEFLAGS_ALTREF = 4, } FRAMETYPE_FLAGS; - -#include <assert.h> - static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { - switch (mode) { - case NORMAL: - *hr = 1; - *hs = 1; - break; - case FOURFIVE: - *hr = 4; - *hs = 5; - break; - case THREEFIVE: - *hr = 3; - *hs = 5; - break; - case ONETWO: - *hr = 1; - *hs = 2; - break; - default: - *hr = 1; - *hs = 1; - assert(0); - break; - } - } - typedef struct { int version; // 4 versions of bitstream defined: // 0 - best quality/slowest decode, // 3 - lowest quality/fastest decode int width; // width of data passed to the compressor int height; // height of data passed to the compressor - double frame_rate; // set to passed in framerate + double framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 @@ -228,9 +200,9 @@ extern "C" int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, - int delta_q[MAX_MB_SEGMENTS], - int delta_lf[MAX_MB_SEGMENTS], - unsigned int threshold[MAX_MB_SEGMENTS]); + int delta_q[MAX_SEGMENTS], + int delta_lf[MAX_SEGMENTS], + unsigned int threshold[MAX_SEGMENTS]); int vp9_set_active_map(VP9_PTR comp, unsigned char 
*map, unsigned int rows, unsigned int cols); diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 0d8b0f4..f31f24b 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -24,87 +24,57 @@ #include "vp9/common/vp9_postproc.h" #endif -/* Create/destroy static data structures. */ - -// Define the number of candidate reference buffers. -#define NUM_REF_FRAMES 8 -#define NUM_REF_FRAMES_LG2 3 - #define ALLOWED_REFS_PER_FRAME 3 +#define NUM_REF_FRAMES_LOG2 3 +#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2) + // 1 scratch frame for the new frame, 3 for scaled references on the encoder // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. #define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) -#define NUM_FRAME_CONTEXTS_LG2 2 -#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2) - -#define MAX_LAG_BUFFERS 25 +#define NUM_FRAME_CONTEXTS_LOG2 2 +#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) typedef struct frame_contexts { vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - - nmv_context nmvc; - nmv_context pre_nmvc; - /* interframe intra mode probs */ - vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; - vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; - vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - /* interframe intra mode probs */ - unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; - unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES]; - unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES]; - unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] - [COEF_BANDS][PREV_COEF_CONTEXTS]; - - nmv_context_counts NMVcount; vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS - 1]; - vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; - vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; - unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; vp9_prob single_ref_prob[REF_CONTEXTS][2]; vp9_prob comp_ref_prob[REF_CONTEXTS]; - vp9_prob pre_intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob pre_comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob pre_single_ref_prob[REF_CONTEXTS][2]; - vp9_prob pre_comp_ref_prob[REF_CONTEXTS]; - unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; - unsigned int single_ref_count[REF_CONTEXTS][2][2]; - unsigned int comp_ref_count[REF_CONTEXTS][2]; - - vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; - vp9_prob pre_tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - vp9_prob 
pre_tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob pre_tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; - unsigned int tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; - unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - + struct tx_probs tx_probs; vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; - vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS]; - unsigned int mbskip_count[MBSKIP_CONTEXTS][2]; + nmv_context nmvc; } FRAME_CONTEXT; +typedef struct { + unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; + unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES]; + unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES]; + unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] + [COEF_BANDS][PREV_COEF_CONTEXTS]; + unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int mbskip[MBSKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + + typedef enum { SINGLE_PREDICTION_ONLY = 0, COMP_PREDICTION_ONLY = 1, @@ -112,22 +82,13 @@ typedef enum { NB_PREDICTION_TYPES = 3, } COMPPREDMODE_TYPE; -typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - ALLOW_32X32 = 3, - TX_MODE_SELECT = 4, - NB_TXFM_MODES = 5, -} TXFM_MODE; - typedef struct VP9Common { struct vpx_internal_error_info error; - DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]); - DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); #if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); #endif int width; @@ -143,8 +104,6 @@ typedef struct VP9Common { int subsampling_x; int subsampling_y; - YUV_TYPE clr_type; - YV12_BUFFER_CONFIG *frame_to_show; YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; @@ -159,10 +118,7 @@ typedef struct VP9Common { struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; int new_fb_idx; - YV12_BUFFER_CONFIG post_proc_buffer; - YV12_BUFFER_CONFIG temp_scale_frame; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ FRAME_TYPE frame_type; @@ -187,7 +143,7 @@ typedef struct VP9Common { int mode_info_stride; /* profile settings */ - TXFM_MODE txfm_mode; + TX_MODE tx_mode; int base_qindex; int last_kf_gf_q; /* Q used on the last GF or KF */ @@ -200,9 +156,6 @@ typedef struct VP9Common { int a_ac_delta_q; #endif - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. 
*/ @@ -219,10 +172,6 @@ typedef struct VP9Common { loop_filter_info_n lf_info; - int filter_level; - int last_sharpness_level; - int sharpness_level; - int refresh_frame_context; /* Two state 0 = NO, 1 = YES */ int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ @@ -235,17 +184,6 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; PARTITION_CONTEXT left_seg_context[8]; - /* keyframe block modes are predicted by their above, left neighbors */ - - vp9_prob kf_y_mode_prob[VP9_INTRA_MODES] - [VP9_INTRA_MODES] - [VP9_INTRA_MODES - 1]; - vp9_prob kf_uv_mode_prob[VP9_INTRA_MODES] [VP9_INTRA_MODES - 1]; - - // Context probabilities when using predictive coding of segment id - vp9_prob segment_pred_probs[PREDICTION_PROBS]; - unsigned char temporal_update; - // Context probabilities for reference frame prediction int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; @@ -255,14 +193,11 @@ typedef struct VP9Common { FRAME_CONTEXT fc; /* this frame entropy */ FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; unsigned int frame_context_idx; /* Context to use/update */ + FRAME_COUNTS counts; unsigned int current_video_frame; - int near_boffset[3]; int version; - double bitrate; - double framerate; - #if CONFIG_POSTPROC struct postproc_state postproc_state; #endif @@ -270,10 +205,9 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; - int tile_columns, log2_tile_columns; - int cur_tile_mi_col_start, cur_tile_mi_col_end, cur_tile_col_idx; - int tile_rows, log2_tile_rows; - int cur_tile_mi_row_start, cur_tile_mi_row_end, cur_tile_row_idx; + int log2_tile_cols, log2_tile_rows; + int cur_tile_mi_col_start, cur_tile_mi_col_end; + int cur_tile_mi_row_start, cur_tile_mi_row_end; } VP9_COMMON; static int get_free_fb(VP9_COMMON *cm) { @@ -296,15 +230,14 @@ static void ref_cnt_fb(int *buf, int *idx, int new_idx) { buf[new_idx]++; } -static int mi_cols_aligned_to_sb(VP9_COMMON *cm) { - return 2 * ((cm->mb_cols + 3) & ~3); +static int mi_cols_aligned_to_sb(int n_mis) { + return ALIGN_POWER_OF_TWO(n_mis, LOG2_MI_BLOCK_SIZE); } -static INLINE void set_partition_seg_context(VP9_COMMON *cm, - MACROBLOCKD *xd, +static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col) { xd->above_seg_context = cm->above_seg_context + mi_col; - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); + xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); } static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index 4282ddd..1157fbb 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -411,7 +411,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { } - for (next = next; next < 256; next++) + for (; next < 256; next++) char_dist[next] = 0; } @@ -630,9 +630,11 @@ static void constrain_line(int x0, int *x1, int y0, int *y1, } } -int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, +int vp9_post_proc_frame(struct VP9Common *oci, + struct loopfilter *lf, + YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { - int q = oci->filter_level * 10 / 6; + int q = lf->filter_level * 10 / 6; int flags = ppflags->post_proc_flag; int deblock_level = ppflags->deblocking_level; int noise_level = ppflags->noise_level; @@ -758,7 +760,7 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, if (flags & VP9D_DEBUG_TXT_RATE_INFO) { char message[512]; 
snprintf(message, sizeof(message), - "Bitrate: %10.2f frame_rate: %10.2f ", + "Bitrate: %10.2f framerate: %10.2f ", oci->bitrate, oci->framerate); vp9_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride); @@ -936,9 +938,9 @@ int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, for (bx = 0; bx < 16; bx += 4) { if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) || (ppflags->display_mb_modes_flag & I4X4_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2]; + Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; + U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; + V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, 0xc000, y_stride); diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h index 2c0d333..a814e39 100644 --- a/libvpx/vp9/common/vp9_postproc.h +++ b/libvpx/vp9/common/vp9_postproc.h @@ -26,8 +26,8 @@ struct postproc_state { #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); +int vp9_post_proc_frame(struct VP9Common *oci, struct loopfilter *lf, + YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index 17da4f2..e8bcdea 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -16,505 +16,425 @@ #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_treecoder.h" -// TBD prediction functions for various bitstream signals - // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id) { +unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + // left + const int left_mv_pred = is_inter_mode(left_mbmi->mode); + const int left_interp = left_in_image && left_mv_pred ? + vp9_switchable_interp_map[left_mbmi->interp_filter] : + VP9_SWITCHABLE_FILTERS; + + // above + const int above_mv_pred = is_inter_mode(above_mbmi->mode); + const int above_interp = above_in_image && above_mv_pred ? 
+ vp9_switchable_interp_map[above_mbmi->interp_filter] : + VP9_SWITCHABLE_FILTERS; + + assert(left_interp != -1); + assert(above_interp != -1); + + if (left_interp == above_interp) + return left_interp; + else if (left_interp == VP9_SWITCHABLE_FILTERS && + above_interp != VP9_SWITCHABLE_FILTERS) + return above_interp; + else if (left_interp != VP9_SWITCHABLE_FILTERS && + above_interp == VP9_SWITCHABLE_FILTERS) + return left_interp; + else + return VP9_SWITCHABLE_FILTERS; +} +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { int pred_context; const MODE_INFO *const mi = xd->mode_info_context; - const MODE_INFO *const above_mi = mi - cm->mode_info_stride; - const MODE_INFO *const left_mi = mi - 1; - const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image; - const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - switch (pred_id) { - case PRED_SEG_ID: - pred_context = above_mi->mbmi.seg_id_predicted; - if (xd->left_available) - pred_context += left_mi->mbmi.seg_id_predicted; - break; - - case PRED_MBSKIP: - pred_context = above_mi->mbmi.mb_skip_coeff; - if (xd->left_available) - pred_context += left_mi->mbmi.mb_skip_coeff; - break; - - case PRED_SWITCHABLE_INTERP: { - // left - const int left_mv_pred = is_inter_mode(left_mi->mbmi.mode); - const int left_interp = left_in_image && left_mv_pred ? - vp9_switchable_interp_map[left_mi->mbmi.interp_filter] : - VP9_SWITCHABLE_FILTERS; - - // above - const int above_mv_pred = is_inter_mode(above_mi->mbmi.mode); - const int above_interp = above_in_image && above_mv_pred ? - vp9_switchable_interp_map[above_mi->mbmi.interp_filter] : - VP9_SWITCHABLE_FILTERS; - - assert(left_interp != -1); - assert(above_interp != -1); - - if (left_interp == above_interp) - pred_context = left_interp; - else if (left_interp == VP9_SWITCHABLE_FILTERS && - above_interp != VP9_SWITCHABLE_FILTERS) - pred_context = above_interp; - else if (left_interp != VP9_SWITCHABLE_FILTERS && - above_interp == VP9_SWITCHABLE_FILTERS) - pred_context = left_interp; - else - pred_context = VP9_SWITCHABLE_FILTERS; - - break; + if (above_in_image && left_in_image) { // both edges available + if (left_mbmi->ref_frame[0] == INTRA_FRAME && + above_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (3) + pred_context = 3; + } else { // intra/inter (1) or inter/inter (0) + pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME || + above_mbmi->ref_frame[0] == INTRA_FRAME; } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; - case PRED_INTRA_INTER: { - if (above_in_image && left_in_image) { // both edges available - if (left_mi->mbmi.ref_frame[0] == INTRA_FRAME && - above_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (3) - pred_context = 3; - } else { // intra/inter (1) or inter/inter (0) - pred_context = left_mi->mbmi.ref_frame[0] == INTRA_FRAME || - above_mi->mbmi.ref_frame[0] == INTRA_FRAME; - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - // inter: 0, intra: 2 - pred_context = 2 * (edge->mbmi.ref_frame[0] == INTRA_FRAME); - } else { - pred_context = 0; - } - assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); - break; - } + // inter: 0, intra: 2 + pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME); + } else { + pred_context = 0; + } + assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); + return pred_context; +} +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[1] <= INTRA_FRAME && + left_mbmi->ref_frame[1] <= INTRA_FRAME) + // neither edge uses comp pred (0/1) + pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); + else if (above_mbmi->ref_frame[1] <= INTRA_FRAME) + // one of two edges uses comp pred (2/3) + pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || + above_mbmi->ref_frame[0] == INTRA_FRAME); + else if (left_mbmi->ref_frame[1] <= INTRA_FRAME) + // one of two edges uses comp pred (2/3) + pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || + left_mbmi->ref_frame[0] == INTRA_FRAME); + else // both edges use comp pred (4) + pred_context = 4; + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + // edge does not use comp pred (0/1) + pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; + else + // edge uses comp pred (3) + pred_context = 3; + } else { // no edges available (1) + pred_context = 1; + } + assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); + return pred_context; +} - case PRED_COMP_INTER_INTER: { - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && - left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - // neither edge uses comp pred (0/1) - pred_context = ((above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref) ^ - (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref)); - } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - // one of two edges uses comp pred (2/3) - pred_context = 2 + - (above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref || - above_mi->mbmi.ref_frame[0] == INTRA_FRAME); - } else if (left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - // one of two edges uses comp pred (2/3) - pred_context = 2 + - (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME); - } else { // both edges use comp pred (4) +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-cm->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int var_ref_idx = !fix_ref_idx; + + if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[0] == INTRA_FRAME && + left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (2) + pred_context = 2; + } else if (above_mbmi->ref_frame[0] == INTRA_FRAME || + left_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/inter + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ? + left_mbmi : above_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) // single pred (1/3) + pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]); + else // comp pred (1/3) + pred_context = 1 + 2 * (edge_mbmi->ref_frame[var_ref_idx] + != cm->comp_var_ref[1]); + } else { // inter/inter + int l_sg = left_mbmi->ref_frame[1] <= INTRA_FRAME; + int a_sg = above_mbmi->ref_frame[1] <= INTRA_FRAME; + MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] + : above_mbmi->ref_frame[var_ref_idx]; + MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] + : left_mbmi->ref_frame[var_ref_idx]; + + if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { + pred_context = 0; + } else if (l_sg && a_sg) { // single/single + if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || + (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) pred_context = 4; - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? 
above_mi : left_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - // edge does not use comp pred (0/1) - pred_context = edge->mbmi.ref_frame[0] == cm->comp_fixed_ref; - } else { // edge uses comp pred (3) + else if (vrfa == vrfl) pred_context = 3; - } - } else { // no edges available (1) - pred_context = 1; - } - assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); - break; - } - - case PRED_COMP_REF_P: { - const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; - const int var_ref_idx = !fix_ref_idx; - - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (2) - pred_context = 2; - } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/inter - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? - left_mi : above_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { // single pred (1/3) - pred_context = 1 + - 2 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1]; - } else { // comp pred (1/3) - pred_context = 1 + - 2 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]; - } - } else { // inter/inter - int l_sg = left_mi->mbmi.ref_frame[1] <= INTRA_FRAME; - int a_sg = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME; - MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->mbmi.ref_frame[0] : - above_mi->mbmi.ref_frame[var_ref_idx]; - MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->mbmi.ref_frame[0] : - left_mi->mbmi.ref_frame[var_ref_idx]; - - if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { - pred_context = 0; - } else if (l_sg && a_sg) { // single/single - if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || - (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) { - pred_context = 4; - } else if (vrfa == vrfl) { - pred_context = 3; - } else { - pred_context = 1; - } - } else if (l_sg || a_sg) { // single/comp - MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; - MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; - - if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) { - pred_context = 1; - } else if (rfs == cm->comp_var_ref[1] && - vrfc != cm->comp_var_ref[1]) { - pred_context = 2; - } else { - pred_context = 4; - } - } else if (vrfa == vrfl) { // comp/comp - pred_context = 4; - } else { - pred_context = 2; - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - if (edge->mbmi.ref_frame[0] == INTRA_FRAME) { + else + pred_context = 1; + } else if (l_sg || a_sg) { // single/comp + MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) + pred_context = 1; + else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) pred_context = 2; - } else if (edge->mbmi.ref_frame[1] > INTRA_FRAME) { - pred_context = - 4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]; - } else { - pred_context = 3 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1]; - } - } else { // no edges available (2) + else + pred_context = 4; + } else if (vrfa == vrfl) { // comp/comp + pred_context = 4; + } else { pred_context = 2; } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - break; } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[0] == INTRA_FRAME) + pred_context = 2; + else if (edge_mbmi->ref_frame[1] > INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[var_ref_idx] + != cm->comp_var_ref[1]); + else + pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_var_ref[1]); + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - case PRED_SINGLE_REF_P1: { - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - pred_context = 2; - } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? - left_mi : above_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME); - } else { - pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME || - edge->mbmi.ref_frame[1] == LAST_FRAME); - } - } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && - left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 2 * (above_mi->mbmi.ref_frame[0] == LAST_FRAME) + - 2 * (left_mi->mbmi.ref_frame[0] == LAST_FRAME); - } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME && - left_mi->mbmi.ref_frame[1] > INTRA_FRAME) { - pred_context = 1 + (above_mi->mbmi.ref_frame[0] == LAST_FRAME || - above_mi->mbmi.ref_frame[1] == LAST_FRAME || - left_mi->mbmi.ref_frame[0] == LAST_FRAME || - left_mi->mbmi.ref_frame[1] == LAST_FRAME); - } else { - MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1]; - - if (rfs == LAST_FRAME) { - pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); - } else { - pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - if (edge->mbmi.ref_frame[0] == INTRA_FRAME) { - pred_context = 2; - } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME); - } else { - pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME || - edge->mbmi.ref_frame[1] == LAST_FRAME); - } - } else { // no edges available (2) - pred_context = 2; - } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - break; + return pred_context; +} +unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. 
+ if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[0] == INTRA_FRAME && + left_mbmi->ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (above_mbmi->ref_frame[0] == INTRA_FRAME || + left_mbmi->ref_frame[0] == INTRA_FRAME) { + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ? + left_mbmi : above_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || + edge_mbmi->ref_frame[1] == LAST_FRAME); + } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME && + left_mbmi->ref_frame[1] <= INTRA_FRAME) { + pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + + 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); + } else if (above_mbmi->ref_frame[1] > INTRA_FRAME && + left_mbmi->ref_frame[1] > INTRA_FRAME) { + pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || + above_mbmi->ref_frame[1] == LAST_FRAME || + left_mbmi->ref_frame[0] == LAST_FRAME || + left_mbmi->ref_frame[1] == LAST_FRAME); + } else { + MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + + if (rfs == LAST_FRAME) + pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + else + pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[0] == INTRA_FRAME) + pred_context = 2; + else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME); + else + pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || + edge_mbmi->ref_frame[1] == LAST_FRAME); + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} - case PRED_SINGLE_REF_P2: { - if (above_in_image && left_in_image) { // both edges available - if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - pred_context = 2; - } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || - left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? - left_mi : above_mi; - - if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - if (edge->mbmi.ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else { - pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); - } - } else { - pred_context = 1 + 2 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME || - edge->mbmi.ref_frame[1] == GOLDEN_FRAME); - } - } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && - left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { - if (above_mi->mbmi.ref_frame[0] == LAST_FRAME && - left_mi->mbmi.ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else if (above_mi->mbmi.ref_frame[0] == LAST_FRAME || - left_mi->mbmi.ref_frame[0] == LAST_FRAME) { - const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == LAST_FRAME ? 
- left_mi : above_mi; - - pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 2 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME); - } - } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME && - left_mi->mbmi.ref_frame[1] > INTRA_FRAME) { - if (above_mi->mbmi.ref_frame[0] == left_mi->mbmi.ref_frame[0] && - above_mi->mbmi.ref_frame[1] == left_mi->mbmi.ref_frame[1]) { - pred_context = 3 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME || - above_mi->mbmi.ref_frame[1] == GOLDEN_FRAME || - left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME || - left_mi->mbmi.ref_frame[1] == GOLDEN_FRAME); - } else { - pred_context = 2; - } - } else { - MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; - MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? - above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1]; - - if (rfs == GOLDEN_FRAME) { - pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); - } else if (rfs == ALTREF_FRAME) { - pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; - } else { - pred_context = - 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); - } - } - } else if (above_in_image || left_in_image) { // one edge available - const MODE_INFO *edge = above_in_image ? above_mi : left_mi; - - if (edge->mbmi.ref_frame[0] == INTRA_FRAME || - (edge->mbmi.ref_frame[0] == LAST_FRAME && - edge->mbmi.ref_frame[1] <= INTRA_FRAME)) { - pred_context = 2; - } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { - pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 3 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME || - edge->mbmi.ref_frame[1] == GOLDEN_FRAME); - } - } else { // no edges available (2) - pred_context = 2; - } - assert(pred_context >= 0 && pred_context < REF_CONTEXTS); - break; - } +unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; - case PRED_TX_SIZE: { - int above_context, left_context; - int max_tx_size; - if (mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) - max_tx_size = TX_4X4; - else if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16) - max_tx_size = TX_8X8; - else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32) - max_tx_size = TX_16X16; - else - max_tx_size = TX_32X32; - above_context = left_context = max_tx_size; - if (above_in_image) { - above_context = (above_mi->mbmi.mb_skip_coeff ? - max_tx_size : above_mi->mbmi.txfm_size); - } - if (left_in_image) { - left_context = (left_mi->mbmi.mb_skip_coeff ? - max_tx_size : left_mi->mbmi.txfm_size); - } - if (!left_in_image) { - left_context = above_context; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. 
+ if (above_in_image && left_in_image) { // both edges available + if (above_mbmi->ref_frame[0] == INTRA_FRAME && + left_mbmi->ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (above_mbmi->ref_frame[0] == INTRA_FRAME || + left_mbmi->ref_frame[0] == INTRA_FRAME) { + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == INTRA_FRAME ? + left_mbmi : above_mbmi; + + if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) { + if (edge_mbmi->ref_frame[0] == LAST_FRAME) + pred_context = 3; + else + pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME || + edge_mbmi->ref_frame[1] == GOLDEN_FRAME); } - if (!above_in_image) { - above_context = left_context; + } else if (above_mbmi->ref_frame[1] <= INTRA_FRAME && + left_mbmi->ref_frame[1] <= INTRA_FRAME) { + if (above_mbmi->ref_frame[0] == LAST_FRAME && + left_mbmi->ref_frame[0] == LAST_FRAME) { + pred_context = 3; + } else if (above_mbmi->ref_frame[0] == LAST_FRAME || + left_mbmi->ref_frame[0] == LAST_FRAME) { + const MB_MODE_INFO *edge_mbmi = above_mbmi->ref_frame[0] == LAST_FRAME ? + left_mbmi : above_mbmi; + + pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + + 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); } - pred_context = (above_context + left_context > max_tx_size); - break; + } else if (above_mbmi->ref_frame[1] > INTRA_FRAME && + left_mbmi->ref_frame[1] > INTRA_FRAME) { + if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && + above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) + pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || + above_mbmi->ref_frame[1] == GOLDEN_FRAME || + left_mbmi->ref_frame[0] == GOLDEN_FRAME || + left_mbmi->ref_frame[1] == GOLDEN_FRAME); + else + pred_context = 2; + } else { + MV_REFERENCE_FRAME rfs = above_mbmi->ref_frame[1] <= INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf1 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; + MV_REFERENCE_FRAME crf2 = above_mbmi->ref_frame[1] > INTRA_FRAME ? + above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + + if (rfs == GOLDEN_FRAME) + pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + else if (rfs == ALTREF_FRAME) + pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; + else + pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); } - - default: - assert(0); - pred_context = 0; // *** add error trap code. - break; + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? 
above_mbmi : left_mbmi; + + if (edge_mbmi->ref_frame[0] == INTRA_FRAME || + (edge_mbmi->ref_frame[0] == LAST_FRAME && + edge_mbmi->ref_frame[1] <= INTRA_FRAME)) + pred_context = 2; + else if (edge_mbmi->ref_frame[1] <= INTRA_FRAME) + pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); + else + pred_context = 3 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME || + edge_mbmi->ref_frame[1] == GOLDEN_FRAME); + } else { // no edges available (2) + pred_context = 2; } - + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); return pred_context; } - -// This function returns a context probability for coding a given -// prediction signal -vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id) { - const int pred_context = vp9_get_pred_context(cm, xd, pred_id); - - switch (pred_id) { - case PRED_SEG_ID: - return cm->segment_pred_probs[pred_context]; - case PRED_MBSKIP: - return cm->fc.mbskip_probs[pred_context]; - case PRED_INTRA_INTER: - return cm->fc.intra_inter_prob[pred_context]; - case PRED_COMP_INTER_INTER: - return cm->fc.comp_inter_prob[pred_context]; - case PRED_COMP_REF_P: - return cm->fc.comp_ref_prob[pred_context]; - case PRED_SINGLE_REF_P1: - return cm->fc.single_ref_prob[pred_context][0]; - case PRED_SINGLE_REF_P2: - return cm->fc.single_ref_prob[pred_context][1]; - default: - assert(0); - return 128; // *** add error trap code. - } -} - -// This function returns a context probability ptr for coding a given -// prediction signal -const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id) { +// Returns a context number for the given MB prediction signal +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real blocks. +// The prediction flags in these dummy entries are initialized to 0. +unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { const MODE_INFO *const mi = xd->mode_info_context; - const int pred_context = vp9_get_pred_context(cm, xd, pred_id); + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + const int max_tx_size = max_txsize_lookup[mi->mbmi.sb_type]; + int above_context = max_tx_size; + int left_context = max_tx_size; - switch (pred_id) { - case PRED_SWITCHABLE_INTERP: - return &cm->fc.switchable_interp_prob[pred_context][0]; + if (above_in_image) + above_context = above_mbmi->mb_skip_coeff ? max_tx_size + : above_mbmi->txfm_size; - case PRED_TX_SIZE: - if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16) - return cm->fc.tx_probs_8x8p[pred_context]; - else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32) - return cm->fc.tx_probs_16x16p[pred_context]; - else - return cm->fc.tx_probs_32x32p[pred_context]; + if (left_in_image) + left_context = left_mbmi->mb_skip_coeff ? max_tx_size + : left_mbmi->txfm_size; - default: - assert(0); - return NULL; // *** add error trap code. - } -} + if (!left_in_image) + left_context = above_context; -// This function returns the status of the given prediction signal. -// I.e. is the predicted value for the given signal correct. 
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, - PRED_ID pred_id) { - switch (pred_id) { - case PRED_SEG_ID: - return xd->mode_info_context->mbmi.seg_id_predicted; - case PRED_MBSKIP: - return xd->mode_info_context->mbmi.mb_skip_coeff; - default: - assert(0); - return 0; // *** add error trap code. - } + if (!above_in_image) + above_context = left_context; + + return above_context + left_context > max_tx_size; } -// This function sets the status of the given prediction signal. -// I.e. is the predicted value for the given signal correct. -void vp9_set_pred_flag(MACROBLOCKD *const xd, - PRED_ID pred_id, - unsigned char pred_flag) { - const int mis = xd->mode_info_stride; - BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bh = 1 << mi_height_log2(bsize); +void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag) { + MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col]; const int bw = 1 << mi_width_log2(bsize); -#define sub(a, b) (b) < 0 ? (a) + (b) : (a) - const int x_mis = sub(bw, xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)); - const int y_mis = sub(bh, xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)); -#undef sub + const int bh = 1 << mi_height_log2(bsize); + const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; - switch (pred_id) { - case PRED_SEG_ID: - for (y = 0; y < y_mis; y++) { - for (x = 0; x < x_mis; x++) { - xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag; - } - } - break; - - case PRED_MBSKIP: - for (y = 0; y < y_mis; y++) { - for (x = 0; x < x_mis; x++) { - xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag; - } - } - break; - - default: - assert(0); - // *** add error trap code. - break; - } + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + mi[y * cm->mode_info_stride + x].mbmi.seg_id_predicted = pred_flag; } +void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag) { + MODE_INFO *mi = &cm->mi[mi_row * cm->mode_info_stride + mi_col]; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); + const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); + int x, y; -// The following contain the guts of the prediction code used to -// peredict various bitstream signals. 
+ for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + mi[y * cm->mode_info_stride + x].mbmi.mb_skip_coeff = pred_flag; +} -// Macroblock segment id prediction function -int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type, - int mi_row, int mi_col) { - const int mi_index = mi_row * cm->mi_cols + mi_col; - const int bw = 1 << mi_width_log2(sb_type); - const int bh = 1 << mi_height_log2(sb_type); - const int ymis = MIN(cm->mi_rows - mi_row, bh); +int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, + BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); const int xmis = MIN(cm->mi_cols - mi_col, bw); - int segment_id = INT_MAX; - int x, y; + const int ymis = MIN(cm->mi_rows - mi_row, bh); + int x, y, segment_id = INT_MAX; - for (y = 0; y < ymis; y++) { - for (x = 0; x < xmis; x++) { - const int index = mi_index + (y * cm->mi_cols + x); - segment_id = MIN(segment_id, cm->last_frame_seg_map[index]); - } - } + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + segment_id = MIN(segment_id, + segment_ids[mi_offset + y * cm->mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); return segment_id; } diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index b728724..e4b6575 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -14,40 +14,125 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" -// Predicted items -typedef enum { - PRED_SEG_ID = 0, // Segment identifier - PRED_MBSKIP = 1, - PRED_SWITCHABLE_INTERP = 2, - PRED_INTRA_INTER = 3, - PRED_COMP_INTER_INTER = 4, - PRED_SINGLE_REF_P1 = 5, - PRED_SINGLE_REF_P2 = 6, - PRED_COMP_REF_P = 7, - PRED_TX_SIZE = 8 -} PRED_ID; - -unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, - PRED_ID pred_id); - -void vp9_set_pred_flag(MACROBLOCKD *const xd, - PRED_ID pred_id, - unsigned char pred_flag); - - -int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type, - int mi_row, int mi_col); +int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, + BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col); + + +static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + + return above_mbmi->seg_id_predicted + + (xd->left_available ? 
left_mbmi->seg_id_predicted : 0); +} + +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const MACROBLOCKD *xd) { + return xd->seg.pred_probs[vp9_get_pred_context_seg_id(xd)]; +} + +void vp9_set_pred_flag_seg_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag); + +static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) { + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + + return above_mbmi->mb_skip_coeff + + (xd->left_available ? left_mbmi->mb_skip_coeff : 0); +} + +static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)]; +} + +static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { + return xd->mode_info_context->mbmi.mb_skip_coeff; +} + +void vp9_set_pred_flag_mbskip(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, + int mi_row, int mi_col, uint8_t pred_flag); + +unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); + +static INLINE const vp9_prob *vp9_get_pred_probs_switchable_interp( + const VP9_COMMON *cm, const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_switchable_interp(xd); + return &cm->fc.switchable_interp_prob[pred_context][0]; +} + +unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_intra_inter(xd); + return cm->fc.intra_inter_prob[pred_context]; +} + +unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd); + + +static INLINE vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); + return cm->fc.comp_inter_prob[pred_context]; +} + +unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd); + return cm->fc.comp_ref_prob[pred_context]; +} + +unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_single_ref_p1(xd); + return cm->fc.single_ref_prob[pred_context][0]; +} + +unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + const int pred_context = vp9_get_pred_context_single_ref_p2(xd); + return cm->fc.single_ref_prob[pred_context][1]; +} + +unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); + +static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context, + const struct tx_probs *tx_probs) { + if (bsize < BLOCK_SIZE_MB16X16) + return tx_probs->p8x8[context]; + else if (bsize < BLOCK_SIZE_SB32X32) + return tx_probs->p16x16[context]; + else + return tx_probs->p32x32[context]; +} + +static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + const int context = 
vp9_get_pred_context_tx_size(xd); + return get_tx_probs(bsize, context, tx_probs); +} + +static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context, + TX_SIZE tx_size, struct tx_counts *tx_counts) { + if (bsize >= BLOCK_SIZE_SB32X32) + tx_counts->p32x32[context][tx_size]++; + else if (bsize >= BLOCK_SIZE_MB16X16) + tx_counts->p16x16[context][tx_size]++; + else + tx_counts->p8x8[context][tx_size]++; +} #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c index 295c8e7..48d86c5 100644 --- a/libvpx/vp9/common/vp9_quant_common.c +++ b/libvpx/vp9/common/vp9_quant_common.c @@ -12,6 +12,79 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" +#if 1 +static const int16_t dc_qlookup[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, + 13, 14, 15, 16, 17, 18, 19, 19, + 20, 21, 22, 23, 24, 25, 26, 26, + 27, 28, 29, 30, 31, 32, 32, 33, + 34, 35, 36, 37, 38, 38, 39, 40, + 41, 42, 43, 43, 44, 45, 46, 47, + 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, + 61, 62, 62, 63, 64, 65, 66, 66, + 67, 68, 69, 70, 70, 71, 72, 73, + 74, 74, 75, 76, 77, 78, 78, 79, + 80, 81, 81, 82, 83, 84, 85, 85, + 87, 88, 90, 92, 93, 95, 96, 98, + 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, + 123, 125, 127, 129, 131, 134, 136, 138, + 140, 142, 144, 146, 148, 150, 152, 154, + 156, 158, 161, 164, 166, 169, 172, 174, + 177, 180, 182, 185, 187, 190, 192, 195, + 199, 202, 205, 208, 211, 214, 217, 220, + 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, + 280, 284, 288, 292, 296, 300, 304, 309, + 313, 317, 322, 326, 330, 335, 340, 344, + 349, 354, 359, 364, 369, 374, 379, 384, + 389, 395, 400, 406, 411, 417, 423, 429, + 435, 441, 447, 454, 461, 467, 475, 482, + 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, + 654, 668, 684, 700, 717, 736, 755, 775, + 796, 819, 843, 869, 896, 925, 955, 988, + 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, +}; + +static const int16_t ac_qlookup[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, + 79, 80, 81, 82, 83, 84, 85, 86, + 87, 88, 89, 90, 91, 92, 93, 94, + 95, 96, 97, 98, 99, 100, 101, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 155, 158, 161, 164, 167, 170, 173, + 176, 179, 182, 185, 188, 191, 194, 197, + 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, + 265, 270, 275, 280, 285, 290, 295, 300, + 305, 311, 317, 323, 329, 335, 341, 347, + 353, 359, 366, 373, 380, 387, 394, 401, + 408, 416, 424, 432, 440, 448, 456, 465, + 474, 483, 492, 501, 510, 520, 530, 540, + 550, 560, 571, 582, 593, 604, 615, 627, + 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, + 864, 881, 898, 915, 933, 951, 969, 988, + 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, + 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, + 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, + 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +void vp9_init_quant_tables(void) { } +#else static int16_t dc_qlookup[QINDEX_RANGE]; static int16_t ac_qlookup[QINDEX_RANGE]; @@ -46,6 +119,7 @@ void 
vp9_init_quant_tables() { 0.5, ac_val)); } } +#endif int16_t vp9_dc_quant(int qindex, int delta) { return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; @@ -57,9 +131,9 @@ int16_t vp9_ac_quant(int qindex, int delta) { int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) { - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) { - const int data = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q); - return xd->mb_segment_abs_delta == SEGMENT_ABSDATA ? + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_ALT_Q)) { + const int data = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_ALT_Q); + return xd->seg.abs_delta == SEGMENT_ABSDATA ? data : // Abs value clamp(base_qindex + data, 0, MAXQ); // Delta value } else { diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index b28d333..63e5646 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -16,6 +16,7 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" +#include "./vpx_scale_rtcd.h" static int scale_value_x_with_scaling(int val, const struct scale_factors *scale) { @@ -32,45 +33,42 @@ static int unscaled_value(int val, const struct scale_factors *scale) { return val; } -static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv, - const struct scale_factors *scale) { - // returns mv * scale + offset - int_mv32 result; - const int32_t mv_row_q4 = src_mv->as_mv.row << 1; - const int32_t mv_col_q4 = src_mv->as_mv.col << 1; - - result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) - + scale->y_offset_q4; - result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) - + scale->x_offset_q4; - return result; +static MV32 mv_q3_to_q4_with_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + ((mv->row << 1) * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + + scale->y_offset_q4, + ((mv->col << 1) * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + + scale->x_offset_q4 + }; + return res; } -static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv, - const struct scale_factors *scale) { - // returns mv * scale + offset - int_mv32 result; - - result.as_mv.row = src_mv->as_mv.row << 1; - result.as_mv.col = src_mv->as_mv.col << 1; - return result; +static MV32 mv_q3_to_q4_without_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + mv->row << 1, + mv->col << 1 + }; + return res; } -static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp, - int offset_q4) { - int32_t scaled_mv; - // returns the scaled and offset value of the mv component. - scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4; - - return scaled_mv; +static MV32 mv_q4_with_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + (mv->row * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->y_offset_q4, + (mv->col * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + scale->x_offset_q4 + }; + return res; } -static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp, - int offset_q4) { - // returns the scaled and offset value of the mv component. 
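/* A stand-alone sketch (not from the patch) of the fixed-point convention the
 * motion-vector helpers in this hunk rely on: q3 (1/8-pel) vectors are
 * doubled into q4 (1/16-pel), and a q4 vector is split into a full-pel source
 * offset (>> 4) and a sub-pel filter phase (& 15). Names are hypothetical. */
#include <stdio.h>

static void split_mv_q3(int mv_row_q3, int mv_col_q3, int src_stride) {
  const int row_q4 = mv_row_q3 << 1;  /* q3 -> q4, unscaled-reference case */
  const int col_q4 = mv_col_q3 << 1;
  const int offset = (row_q4 >> 4) * src_stride + (col_q4 >> 4);
  const int subpel_x = col_q4 & 15;   /* selects subpix->filter_x[subpel_x] */
  const int subpel_y = row_q4 & 15;   /* selects subpix->filter_y[subpel_y] */
  printf("full-pel offset %d, sub-pel phase (%d, %d)\n",
         offset, subpel_x, subpel_y);
}

int main(void) {
  /* MV (5, 19) in 1/8-pel units is (10, 38) in 1/16-pel units; with a
   * 64-byte stride that is 2 full pels to the right and phases (6, 10). */
  split_mv_q3(5, 19, 64);
  return 0;
}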
- (void)scale_fp; - (void)offset_q4; - return mv_q4; +static MV32 mv_q4_without_scaling(const MV *mv, + const struct scale_factors *scale) { + const MV32 res = { + mv->row, + mv->col + }; + return res; } static void set_offsets_with_scaling(struct scale_factors *scale, @@ -112,13 +110,13 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, scale->scale_value_y = unscaled_value; scale->set_scaled_offsets = set_offsets_without_scaling; scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling; - scale->scale_mv_component_q4 = mv_component_q4_without_scaling; + scale->scale_mv_q4 = mv_q4_without_scaling; } else { scale->scale_value_x = scale_value_x_with_scaling; scale->scale_value_y = scale_value_y_with_scaling; scale->set_scaled_offsets = set_offsets_with_scaling; scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling; - scale->scale_mv_component_q4 = mv_component_q4_with_scaling; + scale->scale_mv_q4 = mv_q4_with_scaling; } // TODO(agrange): Investigate the best choice of functions to use here @@ -175,9 +173,7 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, if (xd->mode_info_context) { MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - set_scale_factors(xd, - mbmi->ref_frame[0] - 1, - mbmi->ref_frame[1] - 1, + set_scale_factors(xd, mbmi->ref_frame[0] - 1, mbmi->ref_frame[1] - 1, cm->active_ref_scale); } @@ -199,124 +195,20 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } -void vp9_copy_mem16x16_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; - dst[8] = src[8]; - dst[9] = src[9]; - dst[10] = src[10]; - dst[11] = src[11]; - dst[12] = src[12]; - dst[13] = src[13]; - dst[14] = src[14]; - dst[15] = src[15]; - -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; - -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(const uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int_mv *mv_q3, + const int_mv *src_mv, const struct scale_factors *scale, int w, int h, int weight, - const struct subpix_fn_table *subpix) { - int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale); - src += (mv.as_mv.row >> 4) * 
src_stride + (mv.as_mv.col >> 4); - scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight]( - src, src_stride, dst, dst_stride, - subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4, - subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4, - w, h); -} - -void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int_mv *mv_q4, - const struct scale_factors *scale, - int w, int h, int weight, - const struct subpix_fn_table *subpix) { - const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row, - scale->y_scale_fp, - scale->y_offset_q4); - const int scaled_mv_col_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.col, - scale->x_scale_fp, - scale->x_offset_q4); - const int subpel_x = scaled_mv_col_q4 & 15; - const int subpel_y = scaled_mv_row_q4 & 15; - - src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4); + const struct subpix_fn_table *subpix, + enum mv_precision precision) { + const MV32 mv = precision == MV_PRECISION_Q4 + ? scale->scale_mv_q4(&src_mv->as_mv, scale) + : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale); + const int subpel_x = mv.col & 15; + const int subpel_y = mv.row & 15; + + src += (mv.row >> 4) * src_stride + (mv.col >> 4); scale->predict[!!subpel_x][!!subpel_y][weight]( src, src_stride, dst, dst_stride, subpix->filter_x[subpel_x], scale->x_step_q4, @@ -387,17 +279,16 @@ static void build_inter_predictors(int plane, int block, MACROBLOCKD * const xd = arg->xd; const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - const int bh = 4 << bhl, bw = 4 << bwl; const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl); const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0; int which_mv; - assert(x < bw); - assert(y < bh); + assert(x < (4 << bwl)); + assert(y < (4 << bhl)); assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_w == bw); + 4 << pred_w == (4 << bwl)); assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_h == bh); + 4 << pred_h == (4 << bhl)); for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { // source @@ -405,8 +296,7 @@ static void build_inter_predictors(int plane, int block, const int pre_stride = arg->pre_stride[which_mv][plane]; const uint8_t *const pre = base_pre + scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]); - struct scale_factors * const scale = - plane == 0 ? 
&xd->scale_factor[which_mv] : &xd->scale_factor_uv[which_mv]; + struct scale_factors * const scale = &xd->scale_factor[which_mv]; // dest uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x; @@ -446,11 +336,11 @@ static void build_inter_predictors(int plane, int block, xd->mb_to_bottom_edge); scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor_q4(pre, pre_stride, - dst, arg->dst_stride[plane], - &clamped_mv, &xd->scale_factor[which_mv], - 4 << pred_w, 4 << pred_h, which_mv, - &xd->subpix); + vp9_build_inter_predictor(pre, pre_stride, + dst, arg->dst_stride[plane], + &clamped_mv, &xd->scale_factor[which_mv], + 4 << pred_w, 4 << pred_h, which_mv, + &xd->subpix, MV_PRECISION_Q4); } } void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, @@ -505,13 +395,6 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); } -/*encoder only*/ -void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd, - int mb_row, int mb_col) { - vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col, - BLOCK_SIZE_MB16X16); -} - // TODO(dkovalev: find better place for this function) void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; @@ -523,6 +406,10 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { vp9_setup_scale_factors_for_frame(sf, fb->y_crop_width, fb->y_crop_height, cm->width, cm->height); + + if (sf->x_scale_fp != VP9_REF_NO_SCALE || + sf->y_scale_fp != VP9_REF_NO_SCALE) + vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); } } diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 4e52185..e37750d 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -42,14 +42,8 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride, const int_mv *mv_q3, const struct scale_factors *scale, int w, int h, int do_avg, - const struct subpix_fn_table *subpix); - -void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - const int_mv *mv_q4, - const struct scale_factors *scale, - int w, int h, int do_avg, - const struct subpix_fn_table *subpix); + const struct subpix_fn_table *subpix, + enum mv_precision precision); static int scaled_buffer_offset(int x_offset, int y_offset, int stride, const struct scale_factors *scale) { @@ -86,43 +80,29 @@ static void setup_dst_planes(MACROBLOCKD *xd, } } -static void setup_pre_planes(MACROBLOCKD *xd, - const YV12_BUFFER_CONFIG *src0, - const YV12_BUFFER_CONFIG *src1, +static void setup_pre_planes(MACROBLOCKD *xd, int i, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, - const struct scale_factors *scale, - const struct scale_factors *scale_uv) { - const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1}; - int i, j; - - for (i = 0; i < 2; ++i) { - const YV12_BUFFER_CONFIG *src = srcs[i]; - if (src) { - uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *pd = &xd->plane[j]; - const struct scale_factors *sf = j ? scale_uv : scale; - setup_pred_plane(&pd->pre[i], - buffers[j], strides[j], - mi_row, mi_col, sf ? 
&sf[i] : NULL, - pd->subsampling_x, pd->subsampling_y); - } + const struct scale_factors *sf) { + if (src) { + int j; + uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + struct macroblockd_plane *pd = &xd->plane[j]; + setup_pred_plane(&pd->pre[i], buffers[j], strides[j], + mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y); } } } -static void set_scale_factors(MACROBLOCKD *xd, - int ref0, int ref1, - struct scale_factors scale_factor[MAX_REF_FRAMES]) { - - xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0]; - xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0]; - xd->scale_factor_uv[0] = xd->scale_factor[0]; - xd->scale_factor_uv[1] = xd->scale_factor[1]; +static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1, + struct scale_factors sf[MAX_REF_FRAMES]) { + xd->scale_factor[0] = sf[ref0 >= 0 ? ref0 : 0]; + xd->scale_factor[1] = sf[ref1 >= 0 ? ref1 : 0]; } void vp9_setup_scale_factors(VP9_COMMON *cm, int i); diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c index 85dfe51..f351224 100644 --- a/libvpx/vp9/common/vp9_reconintra.c +++ b/libvpx/vp9/common/vp9_reconintra.c @@ -15,187 +15,351 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_onyxc_int.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_once.h" -static void d27_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = { + DCT_DCT, // DC + ADST_DCT, // V + DCT_ADST, // H + DCT_DCT, // D45 + ADST_ADST, // D135 + ADST_DCT, // D117 + DCT_ADST, // D153 + DCT_ADST, // D27 + ADST_DCT, // D63 + ADST_ADST, // TM + DCT_DCT, // NEARESTMV + DCT_DCT, // NEARMV + DCT_DCT, // ZEROMV + DCT_DCT // NEWMV +}; + +#define intra_pred_sized(type, size) \ +void vp9_##type##_predictor_##size##x##size##_c(uint8_t *pred_ptr, \ + ptrdiff_t stride, \ + uint8_t *above_row, \ + uint8_t *left_col) { \ + type##_predictor(pred_ptr, stride, size, above_row, left_col); \ +} +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) + +static INLINE void d27_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; // first column - for (r = 0; r < bh - 1; ++r) { - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] + - yleft_col[r + 1], 1); + for (r = 0; r < bs - 1; ++r) { + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] + + left_col[r + 1], 1); } - ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1]; - ypred_ptr++; + pred_ptr[(bs - 1) * stride] = left_col[bs - 1]; + pred_ptr++; // second column - for (r = 0; r < bh - 2; ++r) { - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] + - yleft_col[r + 1] * 2 + - yleft_col[r + 2], 2); + for (r = 0; r < bs - 2; ++r) { + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r] + + left_col[r + 1] * 2 + + left_col[r + 2], 2); } - ypred_ptr[(bh - 2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[bh - 2] + - yleft_col[bh - 1] * 3, + pred_ptr[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left_col[bs - 2] + + left_col[bs - 1] * 3, 2); - ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1]; - ypred_ptr++; + pred_ptr[(bs - 1) * stride] = left_col[bs - 1]; + pred_ptr++; // rest of last row - for (c = 0; c < bw - 2; ++c) { - ypred_ptr[(bh - 
1) * y_stride + c] = yleft_col[bh-1]; + for (c = 0; c < bs - 2; ++c) { + pred_ptr[(bs - 1) * stride + c] = left_col[bs - 1]; } - for (r = bh - 2; r >= 0; --r) { - for (c = 0; c < bw - 2; ++c) { - ypred_ptr[r * y_stride + c] = ypred_ptr[(r + 1) * y_stride + c - 2]; + for (r = bs - 2; r >= 0; --r) { + for (c = 0; c < bs - 2; ++c) { + pred_ptr[r * stride + c] = pred_ptr[(r + 1) * stride + c - 2]; } } } +intra_pred_allsizes(d27) -static void d63_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d63_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { if (r & 1) { - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r/2 + c] + - yabove_row[r/2 + c + 1] * 2 + - yabove_row[r/2 + c + 2], 2); + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + + above_row[r/2 + c + 1] * 2 + + above_row[r/2 + c + 2], 2); } else { - ypred_ptr[c] =ROUND_POWER_OF_TWO(yabove_row[r/2 + c] + - yabove_row[r/2+ c + 1], 1); + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r/2 + c] + + above_row[r/2+ c + 1], 1); } } - ypred_ptr += y_stride; + pred_ptr += stride; } } +intra_pred_allsizes(d63) -static void d45_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d45_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - if (r + c + 2 < bw * 2) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r + c] + - yabove_row[r + c + 1] * 2 + - yabove_row[r + c + 2], 2); + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + if (r + c + 2 < bs * 2) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[r + c] + + above_row[r + c + 1] * 2 + + above_row[r + c + 2], 2); else - ypred_ptr[c] = yabove_row[bw * 2 - 1]; + pred_ptr[c] = above_row[bs * 2 - 1]; } - ypred_ptr += y_stride; + pred_ptr += stride; } } +intra_pred_allsizes(d45) -static void d117_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d117_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; // first row - for (c = 0; c < bw; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1); - ypred_ptr += y_stride; + for (c = 0; c < bs; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + above_row[c], 1); + pred_ptr += stride; // second row - ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + - yabove_row[-1] * 2 + - yabove_row[0], 2); - for (c = 1; c < bw; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] + - yabove_row[c - 1] * 2 + - yabove_row[c], 2); - ypred_ptr += y_stride; + pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + + above_row[-1] * 2 + + above_row[0], 2); + for (c = 1; c < bs; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + + above_row[c - 1] * 2 + + above_row[c], 2); + pred_ptr += stride; // the rest of first col - ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + - yleft_col[0] * 2 + - yleft_col[1], 2); - for (r = 3; r < bh; ++r) - ypred_ptr[(r-2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 3] + - yleft_col[r - 2] * 2 + - yleft_col[r - 1], 2); + pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + + left_col[0] * 2 + + left_col[1], 2); + for (r = 3; r < bs; ++r) + pred_ptr[(r-2) * 
stride] = ROUND_POWER_OF_TWO(left_col[r - 3] + + left_col[r - 2] * 2 + + left_col[r - 1], 2); // the rest of the block - for (r = 2; r < bh; ++r) { - for (c = 1; c < bw; c++) - ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1]; - ypred_ptr += y_stride; + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) + pred_ptr[c] = pred_ptr[-2 * stride + c - 1]; + pred_ptr += stride; } } +intra_pred_allsizes(d117) + +static INLINE void d135_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r, c; + pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + + above_row[-1] * 2 + + above_row[0], 2); + for (c = 1; c < bs; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 2] + + above_row[c - 1] * 2 + + above_row[c], 2); + + pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] + + left_col[0] * 2 + + left_col[1], 2); + for (r = 2; r < bs; ++r) + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + + left_col[r - 1] * 2 + + left_col[r], 2); + pred_ptr += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) + pred_ptr[c] = pred_ptr[-stride + c - 1]; + pred_ptr += stride; + } +} +intra_pred_allsizes(d135) -static void d135_predictor(uint8_t *ypred_ptr, int y_stride, - int bw, int bh, - uint8_t *yabove_row, uint8_t *yleft_col) { +static INLINE void d153_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + - yabove_row[-1] * 2 + - yabove_row[0], 2); - for (c = 1; c < bw; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] + - yabove_row[c - 1] * 2 + - yabove_row[c], 2); - - ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] + - yleft_col[0] * 2 + - yleft_col[1], 2); - for (r = 2; r < bh; ++r) - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] + - yleft_col[r - 1] * 2 + - yleft_col[r], 2); - - ypred_ptr += y_stride; - for (r = 1; r < bh; ++r) { - for (c = 1; c < bw; c++) - ypred_ptr[c] = ypred_ptr[-y_stride + c - 1]; - ypred_ptr += y_stride; + pred_ptr[0] = ROUND_POWER_OF_TWO(above_row[-1] + left_col[0], 1); + for (r = 1; r < bs; r++) + pred_ptr[r * stride] = + ROUND_POWER_OF_TWO(left_col[r - 1] + left_col[r], 1); + pred_ptr++; + + pred_ptr[0] = ROUND_POWER_OF_TWO(left_col[0] + + above_row[-1] * 2 + + above_row[0], 2); + pred_ptr[stride] = ROUND_POWER_OF_TWO(above_row[-1] + + left_col[0] * 2 + + left_col[1], 2); + for (r = 2; r < bs; r++) + pred_ptr[r * stride] = ROUND_POWER_OF_TWO(left_col[r - 2] + + left_col[r - 1] * 2 + + left_col[r], 2); + pred_ptr++; + + for (c = 0; c < bs - 2; c++) + pred_ptr[c] = ROUND_POWER_OF_TWO(above_row[c - 1] + + above_row[c] * 2 + + above_row[c + 1], 2); + pred_ptr += stride; + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) + pred_ptr[c] = pred_ptr[-stride + c - 2]; + pred_ptr += stride; + } +} +intra_pred_allsizes(d153) + +static INLINE void v_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r; + + for (r = 0; r < bs; r++) { + vpx_memcpy(pred_ptr, above_row, bs); + pred_ptr += stride; } } +intra_pred_allsizes(v) -static void d153_predictor(uint8_t *ypred_ptr, - int y_stride, - int bw, int bh, - uint8_t *yabove_row, - uint8_t *yleft_col) { +static INLINE void h_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, left_col[r], bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(h) + +static INLINE void 
tm_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { int r, c; - ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1); - for (r = 1; r < bh; r++) - ypred_ptr[r * y_stride] = - ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1); - ypred_ptr++; - - ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + - yabove_row[-1] * 2 + - yabove_row[0], 2); - ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] + - yleft_col[0] * 2 + - yleft_col[1], 2); - for (r = 2; r < bh; r++) - ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] + - yleft_col[r - 1] * 2 + - yleft_col[r], 2); - ypred_ptr++; - - for (c = 0; c < bw - 2; c++) - ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + - yabove_row[c] * 2 + - yabove_row[c + 1], 2); - ypred_ptr += y_stride; - for (r = 1; r < bh; ++r) { - for (c = 0; c < bw - 2; c++) - ypred_ptr[c] = ypred_ptr[-y_stride + c - 2]; - ypred_ptr += y_stride; + int ytop_left = above_row[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + pred_ptr[c] = clip_pixel(left_col[r] + above_row[c] - ytop_left); + pred_ptr += stride; + } +} +intra_pred_allsizes(tm) + +static INLINE void dc_128_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int r; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, 128, bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(dc_128) + +static INLINE void dc_left_predictor(uint8_t *pred_ptr, ptrdiff_t stride, + int bs, + uint8_t *above_row, uint8_t *left_col) { + int i, r; + int expected_dc = 128; + int average = 0; + const int count = bs; + + for (i = 0; i < bs; i++) + average += left_col[i]; + expected_dc = (average + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, expected_dc, bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(dc_left) + +static INLINE void dc_top_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int i, r; + int expected_dc = 128; + int average = 0; + const int count = bs; + + for (i = 0; i < bs; i++) + average += above_row[i]; + expected_dc = (average + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, expected_dc, bs); + pred_ptr += stride; + } +} +intra_pred_allsizes(dc_top) + +static INLINE void dc_predictor(uint8_t *pred_ptr, ptrdiff_t stride, int bs, + uint8_t *above_row, uint8_t *left_col) { + int i, r; + int expected_dc = 128; + int average = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) + average += above_row[i]; + for (i = 0; i < bs; i++) + average += left_col[i]; + expected_dc = (average + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset(pred_ptr, expected_dc, bs); + pred_ptr += stride; } } +intra_pred_allsizes(dc) +#undef intra_pred_allsizes + +typedef void (*intra_pred_fn)(uint8_t *pred_ptr, ptrdiff_t stride, + uint8_t *above_row, uint8_t *left_col); + +static intra_pred_fn pred[VP9_INTRA_MODES][4]; +static intra_pred_fn dc_pred[2][2][4]; + +static void init_intra_pred_fn_ptrs(void) { +#define intra_pred_allsizes(l, type) \ + l[0] = vp9_##type##_predictor_4x4; \ + l[1] = vp9_##type##_predictor_8x8; \ + l[2] = vp9_##type##_predictor_16x16; \ + l[3] = vp9_##type##_predictor_32x32 + + intra_pred_allsizes(pred[V_PRED], v); + intra_pred_allsizes(pred[H_PRED], h); + intra_pred_allsizes(pred[D27_PRED], d27); + intra_pred_allsizes(pred[D45_PRED], d45); + intra_pred_allsizes(pred[D63_PRED], d63); + intra_pred_allsizes(pred[D117_PRED], d117); + 
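/* The table initialisation here replaces the old per-mode switch: once
 * init_intra_pred_fn_ptrs() has run (guarded by once()), predictors are
 * dispatched as pred[mode][size_idx] for the angular/V/H/TM modes and
 * dc_pred[left_available][up_available][size_idx] for the four DC variants,
 * where size_idx is the TX_SIZE (0 = 4x4 ... 3 = 32x32). For example, with
 * hypothetical locals dst/stride/above/left, a DC prediction for an 8x8
 * block with both neighbours present and a TM prediction for a 32x32 block
 * would be invoked roughly as:
 *
 *   dc_pred[1][1][TX_8X8](dst, stride, above, left);
 *   pred[TM_PRED][TX_32X32](dst, stride, above, left);
 */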
intra_pred_allsizes(pred[D135_PRED], d135); + intra_pred_allsizes(pred[D153_PRED], d153); + intra_pred_allsizes(pred[TM_PRED], tm); + + intra_pred_allsizes(dc_pred[0][0], dc_128); + intra_pred_allsizes(dc_pred[0][1], dc_top); + intra_pred_allsizes(dc_pred[1][0], dc_left); + intra_pred_allsizes(dc_pred[1][1], dc); + +#undef intra_pred_allsizes +} -void vp9_build_intra_predictors(uint8_t *src, int src_stride, - uint8_t *ypred_ptr, - int y_stride, int mode, - int bw, int bh, - int up_available, int left_available, - int right_available) { - int r, c, i; - uint8_t yleft_col[64], yabove_data[129], ytop_left; - uint8_t *yabove_row = yabove_data + 1; +static void build_intra_predictors(uint8_t *src, int src_stride, + uint8_t *pred_ptr, int stride, + MB_PREDICTION_MODE mode, TX_SIZE txsz, + int up_available, int left_available, + int right_available) { + int i; + DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64); + DECLARE_ALIGNED_ARRAY(16, uint8_t, yabove_data, 128 + 16); + uint8_t *above_row = yabove_data + 16; + const int bs = 4 << txsz; // 127 127 127 .. 127 127 127 127 127 127 // 129 A B .. Y Z @@ -204,124 +368,37 @@ void vp9_build_intra_predictors(uint8_t *src, int src_stride, // 129 G H .. S T T T T T // .. - assert(bw == bh); - + once(init_intra_pred_fn_ptrs); if (left_available) { - for (i = 0; i < bh; i++) - yleft_col[i] = src[i * src_stride - 1]; + for (i = 0; i < bs; i++) + left_col[i] = src[i * src_stride - 1]; } else { - vpx_memset(yleft_col, 129, bh); + vpx_memset(left_col, 129, bs); } if (up_available) { - uint8_t *yabove_ptr = src - src_stride; - vpx_memcpy(yabove_row, yabove_ptr, bw); - if (bw == 4 && right_available) - vpx_memcpy(yabove_row + bw, yabove_ptr + bw, bw); - else - vpx_memset(yabove_row + bw, yabove_row[bw -1], bw); - ytop_left = left_available ? yabove_ptr[-1] : 129; - } else { - vpx_memset(yabove_row, 127, bw * 2); - ytop_left = 127; - } - yabove_row[-1] = ytop_left; - - switch (mode) { - case DC_PRED: { - int i; - int expected_dc = 128; - int average = 0; - int count = 0; - - if (up_available || left_available) { - if (up_available) { - for (i = 0; i < bw; i++) - average += yabove_row[i]; - count += bw; - } - if (left_available) { - for (i = 0; i < bh; i++) - average += yleft_col[i]; - count += bh; - } - expected_dc = (average + (count >> 1)) / count; - } - for (r = 0; r < bh; r++) { - vpx_memset(ypred_ptr, expected_dc, bw); - ypred_ptr += y_stride; - } + uint8_t *above_ptr = src - src_stride; + if (bs == 4 && right_available && left_available) { + above_row = above_ptr; + } else { + vpx_memcpy(above_row, above_ptr, bs); + if (bs == 4 && right_available) + vpx_memcpy(above_row + bs, above_ptr + bs, bs); + else + vpx_memset(above_row + bs, above_row[bs - 1], bs); + above_row[-1] = left_available ? 
above_ptr[-1] : 129; } - break; - case V_PRED: - for (r = 0; r < bh; r++) { - vpx_memcpy(ypred_ptr, yabove_row, bw); - ypred_ptr += y_stride; - } - break; - case H_PRED: - for (r = 0; r < bh; r++) { - vpx_memset(ypred_ptr, yleft_col[r], bw); - ypred_ptr += y_stride; - } - break; - case TM_PRED: - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left); - ypred_ptr += y_stride; - } - break; - case D45_PRED: - d45_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D135_PRED: - d135_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D117_PRED: - d117_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D153_PRED: - d153_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D27_PRED: - d27_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - case D63_PRED: - d63_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); - break; - default: - break; + } else { + vpx_memset(above_row, 127, bs * 2); + above_row[-1] = 127; } -} - -void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize) { - const struct macroblockd_plane* const pd = &xd->plane[0]; - const int bw = plane_block_width(bsize, pd); - const int bh = plane_block_height(bsize, pd); - vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride, - pd->dst.buf, pd->dst.stride, - xd->mode_info_context->mbmi.mode, - bw, bh, xd->up_available, xd->left_available, - 0 /*xd->right_available*/); -} -void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd, - BLOCK_SIZE_TYPE bsize) { - const int bwl = b_width_log2(bsize), bw = 2 << bwl; - const int bhl = b_height_log2(bsize), bh = 2 << bhl; - - vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride, - xd->plane[1].dst.buf, xd->plane[1].dst.stride, - xd->mode_info_context->mbmi.uv_mode, - bw, bh, xd->up_available, - xd->left_available, 0 /*xd->right_available*/); - vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - xd->mode_info_context->mbmi.uv_mode, - bw, bh, xd->up_available, - xd->left_available, 0 /*xd->right_available*/); + if (mode == DC_PRED) { + dc_pred[left_available][up_available][txsz](pred_ptr, stride, + above_row, left_col); + } else { + pred[mode][txsz](pred_ptr, stride, above_row, left_col); + } } void vp9_predict_intra_block(MACROBLOCKD *xd, @@ -329,29 +406,19 @@ void vp9_predict_intra_block(MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, int mode, + uint8_t *reference, int ref_stride, uint8_t *predictor, int pre_stride) { const int bwl = bwl_in - tx_size; const int wmask = (1 << bwl) - 1; const int have_top = (block_idx >> bwl) || xd->up_available; const int have_left = (block_idx & wmask) || xd->left_available; const int have_right = ((block_idx & wmask) != wmask); - const int txfm_block_size = 4 << tx_size; assert(bwl >= 0); - vp9_build_intra_predictors(predictor, pre_stride, - predictor, pre_stride, - mode, - txfm_block_size, - txfm_block_size, - have_top, have_left, - have_right); -} - -void vp9_intra4x4_predict(MACROBLOCKD *xd, - int block_idx, - BLOCK_SIZE_TYPE bsize, - int mode, - uint8_t *predictor, int pre_stride) { - vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4, - mode, predictor, pre_stride); + build_intra_predictors(reference, ref_stride, + predictor, pre_stride, + mode, + tx_size, + have_top, have_left, + have_right); } diff --git 
a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index f5f5f42..e369a71 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -25,6 +25,6 @@ void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in, TX_SIZE tx_size, - int mode, + int mode, uint8_t *ref, int ref_stride, uint8_t *predictor, int pre_stride); #endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index a405aab..c357ef6 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -22,6 +22,8 @@ EOF } forward_decls vp9_common_forward_decls +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 + # # Dequant # @@ -35,46 +37,177 @@ specialize vp9_idct_add_8x8 prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob" specialize vp9_idct_add - - prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob" specialize vp9_idct_add_32x32 # # RECON # -prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem16x16 mmx sse2 dspr2 -vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 +prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_4x4 + +prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d45_predictor_4x4 + +prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_4x4 + +prototype void vp9_h_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_4x4 ssse3 + +prototype void vp9_d117_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_4x4 + +prototype void vp9_d135_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_4x4 + +prototype void vp9_d153_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_4x4 + +prototype void vp9_v_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_4x4 sse + +prototype void vp9_tm_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_4x4 sse + +prototype void vp9_dc_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_4x4 sse + +prototype void vp9_dc_top_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_4x4 + +prototype void vp9_dc_left_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_4x4 + +prototype void vp9_dc_128_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_4x4 + +prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_8x8 + +prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, 
uint8_t *yleft_col" +specialize vp9_d45_predictor_8x8 + +prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_8x8 + +prototype void vp9_h_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_8x8 ssse3 + +prototype void vp9_d117_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_8x8 + +prototype void vp9_d135_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_8x8 + +prototype void vp9_d153_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_8x8 + +prototype void vp9_v_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_8x8 sse + +prototype void vp9_tm_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_8x8 sse2 + +prototype void vp9_dc_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_8x8 sse + +prototype void vp9_dc_top_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_8x8 + +prototype void vp9_dc_left_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_8x8 + +prototype void vp9_dc_128_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_8x8 + +prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_16x16 + +prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d45_predictor_16x16 + +prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_16x16 + +prototype void vp9_h_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_16x16 ssse3 + +prototype void vp9_d117_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_16x16 + +prototype void vp9_d135_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_16x16 + +prototype void vp9_d153_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_16x16 -prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x8 mmx dspr2 -vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 +prototype void vp9_v_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_16x16 sse2 -prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx +prototype void vp9_tm_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_16x16 
sse2 -prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available" -specialize void vp9_build_intra_predictors +prototype void vp9_dc_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_16x16 sse2 -prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize" -specialize vp9_build_intra_predictors_sby_s +prototype void vp9_dc_top_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_16x16 -prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize" -specialize vp9_build_intra_predictors_sbuv_s +prototype void vp9_dc_left_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_16x16 -prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride" -specialize vp9_intra4x4_predict; +prototype void vp9_dc_128_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_16x16 + +prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d27_predictor_32x32 + +prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d45_predictor_32x32 + +prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d63_predictor_32x32 + +prototype void vp9_h_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_h_predictor_32x32 ssse3 + +prototype void vp9_d117_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d117_predictor_32x32 + +prototype void vp9_d135_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d135_predictor_32x32 + +prototype void vp9_d153_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_d153_predictor_32x32 + +prototype void vp9_v_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_v_predictor_32x32 sse2 + +prototype void vp9_tm_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_tm_predictor_32x32 sse2_x86_64 + +prototype void vp9_dc_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_predictor_32x32 sse2 + +prototype void vp9_dc_top_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_top_predictor_32x32 + +prototype void vp9_dc_left_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_left_predictor_32x32 + +prototype void vp9_dc_128_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" +specialize vp9_dc_128_predictor_32x32 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then prototype void 
vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_8x8 sse2 +specialize vp9_add_constant_residual_8x8 sse2 neon prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_16x16 sse2 +specialize vp9_add_constant_residual_16x16 sse2 neon prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride" -specialize vp9_add_constant_residual_32x32 sse2 +specialize vp9_add_constant_residual_32x32 sse2 neon fi # @@ -84,19 +217,19 @@ prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t specialize vp9_mb_lpf_vertical_edge_w sse2 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_vertical_edge sse2 +specialize vp9_mbloop_filter_vertical_edge sse2 neon prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_vertical_edge mmx +specialize vp9_loop_filter_vertical_edge mmx neon -prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" +prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mb_lpf_horizontal_edge_w sse2 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mbloop_filter_horizontal_edge sse2 +specialize vp9_mbloop_filter_horizontal_edge sse2 neon prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_loop_filter_horizontal_edge mmx +specialize vp9_loop_filter_horizontal_edge mmx neon # # post proc @@ -131,35 +264,41 @@ specialize vp9_blend_b # # Sub Pixel Filters # -prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8 ssse3 +prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve_copy sse2 + +prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve_avg sse2 -prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_horiz ssse3 +prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8 ssse3 neon -prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_vert 
ssse3 +prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_horiz ssse3 neon -prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg ssse3 +prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_vert ssse3 neon -prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_horiz ssse3 +prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg ssse3 neon -prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert ssse3 +prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_horiz ssse3 neon + +prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_vert ssse3 neon # # dct # prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct4x4_1_add +specialize vp9_short_idct4x4_1_add sse2 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct4x4_add sse2 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct8x8_add sse2 +specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_8x8_add sse2 @@ -186,21 +325,18 @@ prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int de specialize vp9_short_idct10_32x32_add prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht4x4_add +specialize vp9_short_iht4x4_add sse2 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" -specialize vp9_short_iht8x8_add +specialize vp9_short_iht8x8_add sse2 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" -specialize vp9_short_iht16x16_add +specialize vp9_short_iht16x16_add sse2 prototype void vp9_idct4_1d "int16_t *input, int16_t *output" specialize vp9_idct4_1d sse2 # dct and add -prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" -specialize vp9_dc_only_idct_add sse2 - prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, 
uint8_t *dest, int dest_stride" specialize vp9_short_iwalsh4x4_1_add @@ -220,8 +356,6 @@ if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then # variance -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 - prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_variance32x16 sse2 @@ -266,88 +400,84 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx sse2 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 +specialize vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 +specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 +specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 +specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 +specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 +specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 +specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 +specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 +specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 +specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 +specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 +specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 +specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 mmx -vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 +specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 +specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 mmx -vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 +specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 +specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 +specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 +specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 +specialize vp9_sub_pixel_avg_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse2 mmx -vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +specialize vp9_sub_pixel_variance4x4 sse ssse3 +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 +specialize vp9_sub_pixel_avg_variance4x4 sse ssse3 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad64x64 sse2 @@ -379,7 +509,6 @@ specialize vp9_sad8x16 mmx sse2 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad8x8 mmx sse2 -# TODO(jingning): need to covert these functions into mmx/sse2 form prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad8x4 sse2 @@ -389,16 +518,55 @@ specialize vp9_sad4x8 sse prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad4x4 mmx sse +prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad64x64_avg sse2 + +prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad32x64_avg sse2 + +prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad64x32_avg sse2 + +prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad32x16_avg sse2 + +prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad16x32_avg sse2 + +prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad32x32_avg 
sse2 + +prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad16x16_avg sse2 + +prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad16x8_avg sse2 + +prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad8x16_avg sse2 + +prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad8x8_avg sse2 + +prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad8x4_avg sse2 + +prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad4x8_avg sse + +prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad" +specialize vp9_sad4x4_avg sse + prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h mmx sse2 +specialize vp9_variance_halfpixvar16x16_h sse2 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v mmx sse2 +specialize vp9_variance_halfpixvar16x16_v sse2 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +specialize vp9_variance_halfpixvar16x16_hv sse2 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" @@ -507,8 +675,8 @@ specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse -prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -specialize vp9_sub_pixel_mse16x16 sse2 mmx +#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +#specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 @@ -533,9 +701,19 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *" 
specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE -prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" -specialize vp9_block_error mmx sse2 -vp9_block_error_sse2=vp9_block_error_xmm +prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz" +specialize vp9_block_error sse2 + +prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" +specialize vp9_subtract_block sse2 + +[ $arch = "x86_64" ] && ssse3_x86_64=ssse3 + +prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +specialize vp9_quantize_b $ssse3_x86_64 + +prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +specialize vp9_quantize_b_32x32 $ssse3_x86_64 # # Structured Similarity (SSIM) @@ -552,13 +730,13 @@ fi # fdct functions prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" -specialize vp9_short_fht4x4 +specialize vp9_short_fht4x4 sse2 prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" -specialize vp9_short_fht8x8 +specialize vp9_short_fht8x8 sse2 prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" -specialize vp9_short_fht16x16 +specialize vp9_short_fht16x16 sse2 prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct8x8 sse2 @@ -573,7 +751,7 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int specialize vp9_short_fdct32x32 prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_fdct32x32_rd +specialize vp9_short_fdct32x32_rd sse2 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct16x16 sse2 diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c index df7747c..6bfd8f8 100644 --- a/libvpx/vp9/common/vp9_seg_common.c +++ b/libvpx/vp9/common/vp9_seg_common.c @@ -9,36 +9,41 @@ */ #include <assert.h> + #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_quant_common.h" static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; -static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 3, 0 }; + +static const int seg_feature_data_max[SEG_LVL_MAX] = { + MAXQ, MAX_LOOP_FILTER, 3, 0 }; // These functions provide access to new segment level features. // Eventually these function may be "optimized out" but for the moment, // the coding mechanism is still subject to change so these provide a // convenient single point of change. 
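(Editor's sketch, not part of the patch: the comment above describes the segment-level feature accessors, and the hunk that follows moves them from MACROBLOCKD onto the standalone struct segmentation introduced in vp9_seg_common.h later in this diff. The snippet below is a minimal, hedged usage example of that refactored API as declared in this change; the helper name example_segmentation_setup, the segment id 2, and the loop-filter value 32 are invented for illustration, and it assumes compilation inside the libvpx tree so the project header resolves.)

/* Usage sketch for the refactored segmentation API (assumes libvpx tree). */
#include <assert.h>
#include "vp9/common/vp9_seg_common.h"

static void example_segmentation_setup(struct segmentation *seg) {
  vp9_clearall_segfeatures(seg);   /* zero feature_data[] and feature_mask[] */
  seg->enabled = 1;                /* vp9_segfeature_active() requires this  */

  /* Give segment 2 an alternate loop-filter strength of 32
   * (within the MAX_LOOP_FILTER bound checked by vp9_set_segdata). */
  vp9_enable_segfeature(seg, 2, SEG_LVL_ALT_LF);
  vp9_set_segdata(seg, 2, SEG_LVL_ALT_LF, 32);

  assert(vp9_segfeature_active(seg, 2, SEG_LVL_ALT_LF));
  assert(vp9_get_segdata(seg, 2, SEG_LVL_ALT_LF) == 32);

  /* Segments that were never enabled for this feature stay inactive. */
  assert(!vp9_segfeature_active(seg, 0, SEG_LVL_ALT_LF));
}

(Passing struct segmentation * explicitly, rather than reaching through MACROBLOCKD as the removed lines below do, is what lets the encoder and decoder share this state without dragging the whole macroblock context along; the function-by-function diff follows.)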
-int vp9_segfeature_active(const MACROBLOCKD *xd, int segment_id, +int vp9_segfeature_active(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - return xd->segmentation_enabled && - (xd->segment_feature_mask[segment_id] & (1 << feature_id)); + return seg->enabled && + (seg->feature_mask[segment_id] & (1 << feature_id)); } -void vp9_clearall_segfeatures(MACROBLOCKD *xd) { - vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask)); +void vp9_clearall_segfeatures(struct segmentation *seg) { + vp9_zero(seg->feature_data); + vp9_zero(seg->feature_mask); } -void vp9_enable_segfeature(MACROBLOCKD *xd, int segment_id, +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - xd->segment_feature_mask[segment_id] |= 1 << feature_id; + seg->feature_mask[segment_id] |= 1 << feature_id; } -void vp9_disable_segfeature(MACROBLOCKD *xd, int segment_id, +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - xd->segment_feature_mask[segment_id] &= ~(1 << feature_id); + seg->feature_mask[segment_id] &= ~(1 << feature_id); } int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { @@ -49,12 +54,12 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { return seg_feature_data_signed[feature_id]; } -void vp9_clear_segdata(MACROBLOCKD *xd, int segment_id, +void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - xd->segment_feature_data[segment_id][feature_id] = 0; + seg->feature_data[segment_id][feature_id] = 0; } -void vp9_set_segdata(MACROBLOCKD *xd, int segment_id, +void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { assert(seg_data <= seg_feature_data_max[feature_id]); if (seg_data < 0) { @@ -62,12 +67,12 @@ void vp9_set_segdata(MACROBLOCKD *xd, int segment_id, assert(-seg_data <= seg_feature_data_max[feature_id]); } - xd->segment_feature_data[segment_id][feature_id] = seg_data; + seg->feature_data[segment_id][feature_id] = seg_data; } -int vp9_get_segdata(const MACROBLOCKD *xd, int segment_id, +int vp9_get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id) { - return xd->segment_feature_data[segment_id][feature_id]; + return seg->feature_data[segment_id][feature_id]; } diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h index 74ba03c..f22239b 100644 --- a/libvpx/vp9/common/vp9_seg_common.h +++ b/libvpx/vp9/common/vp9_seg_common.h @@ -8,23 +8,54 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_blockd.h" - #ifndef VP9_COMMON_VP9_SEG_COMMON_H_ #define VP9_COMMON_VP9_SEG_COMMON_H_ -int vp9_segfeature_active(const MACROBLOCKD *xd, +#include "vp9/common/vp9_treecoder.h" + +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 + +#define MAX_SEGMENTS 8 +#define SEG_TREE_PROBS (MAX_SEGMENTS-1) + +#define PREDICTION_PROBS 3 + +// Segment level features. +typedef enum { + SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... + SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... 
+ SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame + SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode + SEG_LVL_MAX = 4 // Number of features supported +} SEG_LVL_FEATURES; + + +struct segmentation { + uint8_t enabled; + uint8_t update_map; + uint8_t update_data; + uint8_t abs_delta; + uint8_t temporal_update; + + vp9_prob tree_probs[SEG_TREE_PROBS]; + vp9_prob pred_probs[PREDICTION_PROBS]; + + int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; + unsigned int feature_mask[MAX_SEGMENTS]; +}; + +int vp9_segfeature_active(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_clearall_segfeatures(MACROBLOCKD *xd); +void vp9_clearall_segfeatures(struct segmentation *seg); -void vp9_enable_segfeature(MACROBLOCKD *xd, +void vp9_enable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_disable_segfeature(MACROBLOCKD *xd, +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); @@ -32,16 +63,16 @@ int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); -void vp9_clear_segdata(MACROBLOCKD *xd, +void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); -void vp9_set_segdata(MACROBLOCKD *xd, +void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data); -int vp9_get_segdata(const MACROBLOCKD *xd, +int vp9_get_segdata(const struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c index 95296ad..a72d2ab 100644 --- a/libvpx/vp9/common/vp9_tile_common.c +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -10,15 +10,16 @@ #include "vp9/common/vp9_tile_common.h" -#define MIN_TILE_WIDTH 256 -#define MAX_TILE_WIDTH 4096 -#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6) -#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6) - -static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, - int *max_tile_off, int tile_idx, - int log2_n_tiles, int n_mis) { - const int n_sbs = (n_mis + 7) >> 3; +#define MIN_TILE_WIDTH_B64 4 +#define MAX_TILE_WIDTH_B64 64 + +static int to_sbs(n_mis) { + return mi_cols_aligned_to_sb(n_mis) >> LOG2_MI_BLOCK_SIZE; +} + +static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off, + int tile_idx, int log2_n_tiles, int n_mis) { + const int n_sbs = to_sbs(n_mis); const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; @@ -27,37 +28,34 @@ static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, } void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) { - cm->cur_tile_col_idx = tile_col_idx; - vp9_get_tile_offsets(cm, &cm->cur_tile_mi_col_start, - &cm->cur_tile_mi_col_end, tile_col_idx, - cm->log2_tile_columns, cm->mi_cols); + vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end, + tile_col_idx, cm->log2_tile_cols, cm->mi_cols); } void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) { - cm->cur_tile_row_idx = tile_row_idx; - vp9_get_tile_offsets(cm, &cm->cur_tile_mi_row_start, - &cm->cur_tile_mi_row_end, tile_row_idx, - cm->log2_tile_rows, cm->mi_rows); + vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end, + tile_row_idx, cm->log2_tile_rows, cm->mi_rows); } -void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr, - int *delta_log2_n_tiles) { - const int sb_cols = 
(cm->mb_cols + 3) >> 2; +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols) { + const int sb_cols = to_sbs(mi_cols); int min_log2_n_tiles, max_log2_n_tiles; for (max_log2_n_tiles = 0; - (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS; + (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64; max_log2_n_tiles++) {} max_log2_n_tiles--; if (max_log2_n_tiles < 0) max_log2_n_tiles = 0; for (min_log2_n_tiles = 0; - (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols; + (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols; min_log2_n_tiles++) {} - assert(max_log2_n_tiles >= min_log2_n_tiles); - *min_log2_n_tiles_ptr = min_log2_n_tiles; - *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles; + assert(min_log2_n_tiles <= max_log2_n_tiles); + + *min_log2_tile_cols = min_log2_n_tiles; + *max_log2_tile_cols = max_log2_n_tiles; } diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h index 7ea3772..6d14560 100644 --- a/libvpx/vp9/common/vp9_tile_common.h +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -17,7 +17,7 @@ void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx); void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); -void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles, - int *delta_log2_n_tiles); +void vp9_get_tile_n_bits(int mi_cols, + int *min_log2_tile_cols, int *max_log2_tile_cols); #endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c index 531fa75..2e21a5b 100644 --- a/libvpx/vp9/common/vp9_treecoder.c +++ b/libvpx/vp9/common/vp9_treecoder.c @@ -9,12 +9,9 @@ */ -#include "vpx_config.h" - -#if defined(CONFIG_DEBUG) && CONFIG_DEBUG #include <assert.h> -#endif +#include "./vpx_config.h" #include "vp9/common/vp9_treecoder.h" static void tree2tok(struct vp9_token *const p, vp9_tree t, diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c index 2b66834..3f1c198 100644 --- a/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -121,11 +121,12 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); -void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { + /* Ensure the filter can be compressed to int16_t. 
*/ if (x_step_q4 == 16 && filter_x[3] != 128) { while (w >= 16) { vp9_filter_block1d16_h8_ssse3(src, src_stride, @@ -159,8 +160,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -197,8 +198,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -235,8 +236,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -273,8 +274,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { @@ -294,8 +295,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, } } -void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, +void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { diff --git a/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/libvpx/vp9/common/x86/vp9_copy_sse2.asm new file mode 100644 index 0000000..dd522c6 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_copy_sse2.asm @@ -0,0 +1,152 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1 +INIT_XMM sse2 +cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ + fx, fxs, fy, fys, w, h + mov r4d, dword wm + cmp r4d, 4 + je .w4 + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+16] + pavgb m2, [dstq+32] + pavgb m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq +16] + pavgb m2, [dstq+dst_strideq] + pavgb m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +INIT_MMX sse +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endmacro + +convolve_fn copy +convolve_fn avg diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 599dcff..a1e14b4 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -15,64 +15,6 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -// In order to improve performance, clip absolute diff values to [0, 255], -// which allows to keep the additions/subtractions in 8 bits. 
-void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, - uint8_t *dst_ptr, int pitch, int stride) { - int a1; - int16_t out; - uint8_t abs_diff; - __m128i p0, p1, p2, p3; - unsigned int extended_diff; - __m128i diff; - - out = dct_const_round_shift(input_dc * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - // Read prediction data. - p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch)); - p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch)); - p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch)); - p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch)); - - // Unpack prediction data, and store 4x4 array in 1 XMM register. - p0 = _mm_unpacklo_epi32(p0, p1); - p2 = _mm_unpacklo_epi32(p2, p3); - p0 = _mm_unpacklo_epi64(p0, p2); - - // Clip dc value to [0, 255] range. Then, do addition or subtraction - // according to its sign. - if (a1 >= 0) { - abs_diff = (a1 > 255) ? 255 : a1; - extended_diff = abs_diff * 0x01010101u; - diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); - - p1 = _mm_adds_epu8(p0, diff); - } else { - abs_diff = (a1 < -255) ? 255 : -a1; - extended_diff = abs_diff * 0x01010101u; - diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); - - p1 = _mm_subs_epu8(p0, diff); - } - - // Store results to dst. - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); - dst_ptr += stride; - - p1 = _mm_srli_si128(p1, 4); - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); - dst_ptr += stride; - - p1 = _mm_srli_si128(p1, 4); - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); - dst_ptr += stride; - - p1 = _mm_srli_si128(p1, 4); - *(int *)dst_ptr = _mm_cvtsi128_si32(p1); -} - void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); @@ -206,6 +148,23 @@ void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE4X4(dest, input3); } +void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest, dc_value); + RECON_AND_STORE4X4(dest, dc_value); + RECON_AND_STORE4X4(dest, dc_value); + RECON_AND_STORE4X4(dest, dc_value); +} + void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { const __m128i zero = _mm_setzero_si128(); const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, @@ -241,6 +200,155 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { _mm_storel_epi64((__m128i *)output, in); } +static INLINE void transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +void idct4_1d_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + 
__m128i u[8], v[8]; + + transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[2]); + u[1] = _mm_unpacklo_epi16(in[1], in[3]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[2]); + u[1] = _mm_packs_epi32(v[1], v[3]); + u[2] = _mm_unpackhi_epi64(u[0], u[0]); + u[3] = _mm_unpackhi_epi64(u[1], u[1]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[3]); + in[1] = _mm_add_epi16(u[1], u[2]); + in[2] = _mm_sub_epi16(u[1], u[2]); + in[3] = _mm_sub_epi16(u[0], u[3]); +} + +void iadst4_1d_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + transpose_4x4(in); + in7 = _mm_add_epi16(in[0], in[3]); + in7 = _mm_sub_epi16(in7, in[2]); + + u[0] = _mm_unpacklo_epi16(in[0], in[2]); + u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[1], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + in[2] = _mm_unpackhi_epi64(in[0], in[0]); + in[3] = _mm_unpackhi_epi64(in[1], in[1]); +} + +void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[4]; + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + + in[0] = _mm_loadl_epi64((__m128i *)input); + in[1] = _mm_loadl_epi64((__m128i *)(input + 4)); + in[2] = _mm_loadl_epi64((__m128i *)(input + 8)); + in[3] = _mm_loadl_epi64((__m128i *)(input + 12)); + + switch (tx_type) { + case 0: // DCT_DCT + idct4_1d_sse2(in); + idct4_1d_sse2(in); + break; + case 1: // ADST_DCT + 
idct4_1d_sse2(in); + iadst4_1d_sse2(in); + break; + case 2: // DCT_ADST + iadst4_1d_sse2(in); + idct4_1d_sse2(in); + break; + case 3: // ADST_ADST + iadst4_1d_sse2(in); + iadst4_1d_sse2(in); + break; + default: + assert(0); + break; + } + + // Final round and shift + in[0] = _mm_add_epi16(in[0], eight); + in[1] = _mm_add_epi16(in[1], eight); + in[2] = _mm_add_epi16(in[2], eight); + in[3] = _mm_add_epi16(in[3], eight); + + in[0] = _mm_srai_epi16(in[0], 4); + in[1] = _mm_srai_epi16(in[1], 4); + in[2] = _mm_srai_epi16(in[2], 4); + in[3] = _mm_srai_epi16(in[3], 4); + + RECON_AND_STORE4X4(dest, in[0]); + RECON_AND_STORE4X4(dest, in[1]); + RECON_AND_STORE4X4(dest, in[2]); + RECON_AND_STORE4X4(dest, in[3]); +} + #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7) \ { \ @@ -489,6 +597,373 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } +// perform 8x8 transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +void idct8_1d_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + in0 = in[0]; + in1 = in[1]; + in2 = in[2]; + in3 = in[3]; + in4 = in[4]; + in5 = in[5]; + in6 = in[6]; + in7 = in[7]; + + // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 
+ IDCT8x8_1D + in[0] = in0; + in[1] = in1; + in[2] = in2; + in[3] = in3; + in[4] = in4; + in[5] = in5; + in[6] = in6; + in[7] = in7; +} + +void iadst8_1d_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = 
_mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = 
_mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + + +void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in[8]; + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1<<4); + + // load input data + in[0] = _mm_load_si128((__m128i *)input); + in[1] = _mm_load_si128((__m128i *)(input + 8 * 1)); + in[2] = _mm_load_si128((__m128i *)(input + 8 * 2)); + in[3] = _mm_load_si128((__m128i *)(input + 8 * 3)); + in[4] = _mm_load_si128((__m128i *)(input + 8 * 4)); + in[5] = _mm_load_si128((__m128i *)(input + 8 * 5)); + in[6] = _mm_load_si128((__m128i *)(input + 8 * 6)); + in[7] = _mm_load_si128((__m128i *)(input + 8 * 7)); + + switch (tx_type) { + case 0: // DCT_DCT + idct8_1d_sse2(in); + idct8_1d_sse2(in); + break; + case 1: // ADST_DCT + idct8_1d_sse2(in); + iadst8_1d_sse2(in); + break; + case 2: // DCT_ADST + iadst8_1d_sse2(in); + idct8_1d_sse2(in); + break; + case 3: // ADST_ADST + iadst8_1d_sse2(in); + iadst8_1d_sse2(in); + break; + default: + assert(0); + break; + } + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 5); + in[1] = _mm_srai_epi16(in[1], 5); + in[2] = _mm_srai_epi16(in[2], 5); + in[3] = _mm_srai_epi16(in[3], 5); + in[4] = _mm_srai_epi16(in[4], 5); + in[5] = _mm_srai_epi16(in[5], 5); + in[6] = _mm_srai_epi16(in[6], 5); + in[7] = _mm_srai_epi16(in[7], 5); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, 
in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); +} + void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -974,6 +1449,960 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +void iadst16_1d_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = 
_mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], 
k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + 
s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], 
s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = 
_mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], 
DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +void idct16_1d_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], 
k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], 
DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = 
_mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = 
_mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +void idct16_1d_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_1d_8col(in0); + idct16_1d_8col(in1); +} + +void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + iadst16_1d_8col(in0); + iadst16_1d_8col(in1); +} + +static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) { + in[0] = _mm_load_si128((__m128i *)(input + 0 * 16)); + in[1] = _mm_load_si128((__m128i *)(input + 1 * 16)); + in[2] = _mm_load_si128((__m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((__m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((__m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((__m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((__m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((__m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((__m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((__m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((__m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((__m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((__m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((__m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((__m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((__m128i *)(input + 15 * 16)); +} + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + 
RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); +} + +void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride, + int tx_type) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + switch (tx_type) { + case 0: // DCT_DCT + idct16_1d_sse2(in0, in1); + idct16_1d_sse2(in0, in1); + break; + case 1: // ADST_DCT + idct16_1d_sse2(in0, in1); + iadst16_1d_sse2(in0, in1); + break; + case 2: // DCT_ADST + iadst16_1d_sse2(in0, in1); + idct16_1d_sse2(in0, in1); + break; + case 3: // ADST_ADST + iadst16_1d_sse2(in0, in1); + iadst16_1d_sse2(in0, in1); + break; + default: + assert(0); + break; + } + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); diff --git a/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm new file mode 100644 index 0000000..980b8b9 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm @@ -0,0 +1,341 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 + +SECTION .text + +INIT_MMX sse +cglobal dc_predictor_4x4, 4, 4, 2, dst, stride, above, left + pxor m1, m1 + movd m0, [aboveq] + punpckldq m0, [leftq] + psadbw m0, m1 + paddw m0, [pw_4] + psraw m0, 3 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_MMX sse +cglobal dc_predictor_8x8, 4, 4, 3, dst, stride, above, left + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [pw_8] + psraw m0, 4 + pshufw m0, m0, 0x0 + packuswb m0, m0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 4, 3, dst, stride, above, left + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [pw_16] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [pw_32] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_MMX sse +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + 
lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movd m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 + pshufw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -2 + add leftq, 4 + psubw m0, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshufw m2, m2, 0x0 + pshufw m3, m3, 0x0 + paddw m2, m0 + paddw m3, m0 + packuswb m2, m2 + packuswb m3, m3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movq m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -4 + punpcklqdq m2, m2 + add leftq, 8 + psubw m0, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m2, m0 + paddw m3, m0 + packuswb m2, m3 + movq [dstq ], m2 + movhps [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + punpcklbw m2, m1 + punpckhbw m4, m0, m1 + punpcklbw m0, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 + punpcklqdq m2, m2 + add leftq, 16 + psubw m0, m2 + psubw m4, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m3, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m3, m1 + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m5, m2, m0 + paddw m6, m3, m0 + paddw m2, m4 + paddw m3, m4 + packuswb m5, m2 + packuswb m6, m3 + mova [dstq ], m5 + mova [dstq+strideq], m6 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + mova m4, [aboveq+16] + punpcklbw m2, m1 + punpckhbw m3, m0, m1 + punpckhbw m5, m4, m1 + punpcklbw m0, m1 + punpcklbw m4, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + punpcklqdq m2, m2 + add leftq, 32 + psubw m0, m2 + psubw m3, m2 + psubw m4, m2 + psubw m5, m2 +.loop: + movd m2, [leftq+lineq*2] + movd m6, [leftq+lineq*2+1] + punpcklbw m2, m1 + punpcklbw m6, m1 + pshuflw m2, m2, 0x0 + pshuflw m6, m6, 0x0 + punpcklqdq m2, m2 + punpcklqdq m6, m6 + paddw m7, m2, m0 + paddw m8, m2, m3 + paddw m9, m2, m4 + paddw m2, m5 + packuswb m7, m8 + packuswb m9, m2 + paddw m2, m6, m0 + paddw m8, m6, m3 + mova [dstq ], m7 + paddw m7, m6, m4 + paddw m6, m5 + mova [dstq +16], m9 + packuswb m2, m8 + packuswb m7, m6 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m7 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET +%endif diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm new file mode 100644 index 0000000..bc8ed5c --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm @@ 
-0,0 +1,87 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +INIT_MMX ssse3 +cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 4 + mov lineq, -2 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + movd [dstq ], m1 + movd [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_MMX ssse3 +cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 8 + mov lineq, -4 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + movq [dstq ], m1 + movq [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM ssse3 +cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 16 + mov lineq, -8 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + mova [dstq ], m1 + mova [dstq+strideq], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET + +INIT_XMM ssse3 +cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left + movifnidn leftq, leftmp + add leftq, 32 + mov lineq, -16 + pxor m0, m0 +.loop: + movd m1, [leftq+lineq*2 ] + movd m2, [leftq+lineq*2+1] + pshufb m1, m0 + pshufb m2, m0 + mova [dstq ], m1 + mova [dstq +16], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16], m2 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm b/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm deleted file mode 100644 index 1af2521..0000000 --- a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm +++ /dev/null @@ -1,173 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE -sym(vp9_short_inv_walsh4x4_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rax, 3 - - mov rdi, arg(1) - add rax, [rsi] ;input[0] + 3 - - movd mm0, eax - - punpcklwd mm0, mm0 ;x x val val - - punpckldq mm0, mm0 ;val val val val - - psraw mm0, 3 ;(input[0] + 3) >> 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm0 - movq [rdi + 16], mm0 - movq [rdi + 24], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_short_inv_walsh4x4_mmx(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE -sym(vp9_short_inv_walsh4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rax, 3 - mov rsi, arg(0) - mov rdi, arg(1) - shl rax, 16 - - movq mm0, [rsi + 0] ;ip[0] - movq mm1, [rsi + 8] ;ip[4] - or rax, 3 ;00030003h - - movq mm2, [rsi + 16] ;ip[8] - movq mm3, [rsi + 24] ;ip[12] - - movq mm7, rax - movq mm4, mm0 - - punpcklwd mm7, mm7 ;0003000300030003h - movq mm5, mm1 - - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm4 ;temp al - - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl - - ; 03 02 01 00 - ; 13 12 11 10 - ; 23 22 21 20 - ; 33 32 31 30 - - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 - - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] -;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl -;~~~~~~~~~~~~~~~~~~~~~ - movq mm3, mm1 ; 03 02 01 00 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm4, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm4, mm5 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] - - paddw mm0, mm7 - paddw mm1, mm7 - paddw mm2, mm7 - paddw mm3, mm7 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm1 - movq [rdi + 16], mm2 - movq [rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm b/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm deleted file mode 100644 index 84fa2fe..0000000 --- a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm +++ /dev/null @@ -1,119 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_inv_walsh4x4_sse2(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE -sym(vp9_short_inv_walsh4x4_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - SAVE_XMM 6 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rdi, arg(1) - mov rax, 3 - - movdqa xmm0, [rsi + 0] ;ip[4] ip[0] - movdqa xmm1, [rsi + 16] ;ip[12] ip[8] - - shl rax, 16 - or rax, 3 ;00030003h - - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] - - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm4, xmm0 - punpcklqdq xmm0, xmm3 ;d1 a1 - punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm6, eax - - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - -;;;temp output -;; movdqu [rdi + 0], xmm4 -;; movdqu [rdi + 16], xmm3 - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] - - pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 - - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm3 ;d1 a1 - punpckhqdq xmm5, xmm3 ;c1 b1 - - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm6 - paddw xmm1, xmm6 - - psraw xmm5, 3 - psraw xmm1, 3 - - movdqa [rdi + 0], xmm5 - movdqa [rdi + 16], xmm1 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 50f890a..4af4f94 100644 --- a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -12,17 +12,11 @@ #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" -prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2); -prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2); - -extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2; -extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2; - 
-void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]); DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]); @@ -483,6 +477,490 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, } } +static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); + DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); + + DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + + DECLARE_ALIGNED(16, unsigned char, ap[8][16]); + DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + + + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + int i = 0; + const unsigned int extended_thresh = _thresh[0] * 0x01010101u; + const unsigned int extended_limit = _limit[0] * 0x01010101u; + const unsigned int extended_blimit = _blimit[0] * 0x01010101u; + const __m128i thresh = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); + const __m128i limit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); + const __m128i blimit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + + _mm_store_si128((__m128i *)ap[4], p4); + _mm_store_si128((__m128i *)ap[3], p3); + _mm_store_si128((__m128i *)ap[2], p2); + _mm_store_si128((__m128i *)ap[1], p1); + _mm_store_si128((__m128i *)ap[0], p0); + _mm_store_si128((__m128i *)aq[4], q4); + _mm_store_si128((__m128i *)aq[3], q3); + _mm_store_si128((__m128i *)aq[2], q2); + _mm_store_si128((__m128i *)aq[1], q1); + _mm_store_si128((__m128i *)aq[0], q0); + + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) 
> limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), + _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), + _mm_subs_epu8(q0, q5))); + _mm_store_si128((__m128i *)ap[5], p5); + _mm_store_si128((__m128i *)aq[5], q5); + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), + 
_mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), + _mm_subs_epu8(q0, q6))); + _mm_store_si128((__m128i *)ap[6], p6); + _mm_store_si128((__m128i *)aq[6], q6); + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), + _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), + _mm_subs_epu8(q0, q7))); + _mm_store_si128((__m128i *)ap[7], p7); + _mm_store_si128((__m128i *)aq[7], q7); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i temp_flat2 = flat2; + unsigned char *src = s; + int i = 0; + do { + __m128i workp_shft; + __m128i a, b, c; + + unsigned int off = i * 8; + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + + c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 + c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); + + b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); + a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); + a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); + + _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q1, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); + _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q2, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); + _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, 
p5)), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q3, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); + _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); + _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(q4, c); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); + _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + a = _mm_add_epi16(q5, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q6, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + temp_flat2 = _mm_srli_si128(temp_flat2, 8); + src += 8; + } while (++i < 2); + } + // wide 
flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + work_a = _mm_load_si128((__m128i *)ap[2]); + p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_store_si128((__m128i *)flat_op[2], p2); + + p1 = _mm_load_si128((__m128i *)flat_op[1]); + work_a = _mm_andnot_si128(flat, ps1); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_store_si128((__m128i *)flat_op[1], p1); + + p0 = _mm_load_si128((__m128i *)flat_op[0]); + work_a = _mm_andnot_si128(flat, ps0); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_store_si128((__m128i *)flat_op[0], p0); + + q0 = _mm_load_si128((__m128i *)flat_oq[0]); + work_a = _mm_andnot_si128(flat, qs0); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_store_si128((__m128i *)flat_oq[0], q0); + + q1 = _mm_load_si128((__m128i *)flat_oq[1]); + work_a = _mm_andnot_si128(flat, qs1); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_store_si128((__m128i *)flat_oq[1], q1); + + work_a = _mm_load_si128((__m128i *)aq[2]); + q2 = _mm_load_si128((__m128i *)flat_oq[2]); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_store_si128((__m128i *)flat_oq[2], q2); + + // write out op6 - op3 + { + unsigned char *dst = (s - 7 * p); + for (i = 6; i > 2; i--) { + __m128i flat2_output; + work_a = _mm_load_si128((__m128i *)ap[i]); + flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); + work_a = _mm_andnot_si128(flat2, work_a); + flat2_output = _mm_and_si128(flat2, flat2_output); + work_a = _mm_or_si128(work_a, flat2_output); + _mm_storeu_si128((__m128i *)dst, work_a); + dst += p; + } + } + + work_a = _mm_load_si128((__m128i *)flat_op[2]); + p2 = _mm_load_si128((__m128i *)flat2_op[2]); + work_a = _mm_andnot_si128(flat2, work_a); + p2 = _mm_and_si128(flat2, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + + work_a = _mm_load_si128((__m128i *)flat_op[1]); + p1 = _mm_load_si128((__m128i *)flat2_op[1]); + work_a = _mm_andnot_si128(flat2, work_a); + p1 = _mm_and_si128(flat2, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + + work_a = _mm_load_si128((__m128i *)flat_op[0]); + p0 = _mm_load_si128((__m128i *)flat2_op[0]); + work_a = _mm_andnot_si128(flat2, work_a); + p0 = _mm_and_si128(flat2, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + + work_a = _mm_load_si128((__m128i *)flat_oq[0]); + q0 = _mm_load_si128((__m128i *)flat2_oq[0]); + work_a = _mm_andnot_si128(flat2, work_a); + q0 = _mm_and_si128(flat2, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + work_a = _mm_load_si128((__m128i *)flat_oq[1]); + q1 = _mm_load_si128((__m128i *)flat2_oq[1]); + work_a = _mm_andnot_si128(flat2, work_a); + q1 = _mm_and_si128(flat2, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + work_a = _mm_load_si128((__m128i *)flat_oq[2]); + q2 = _mm_load_si128((__m128i *)flat2_oq[2]); + work_a = _mm_andnot_si128(flat2, work_a); + q2 = _mm_and_si128(flat2, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + // write out oq3 - oq7 + { + unsigned char *dst = (s + 3 * p); + for (i = 3; i < 7; i++) { + __m128i flat2_output; + work_a = _mm_load_si128((__m128i *)aq[i]); + flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); + work_a = 
_mm_andnot_si128(flat2, work_a); + flat2_output = _mm_and_si128(flat2, flat2_output); + work_a = _mm_or_si128(work_a, flat2_output); + _mm_storeu_si128((__m128i *)dst, work_a); + dst += p; + } + } + } +} + +void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, + int count) { + if (count == 1) + mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); + else + mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); +} + void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, int p, const unsigned char *_blimit, @@ -722,79 +1200,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, } } -void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - unsigned char *v) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160); - - /* Read source */ - const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)), - _mm_loadl_epi64((__m128i *)(v - 5 * p))); - const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)), - _mm_loadl_epi64((__m128i *)(v - 4 * p))); - const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)), - _mm_loadl_epi64((__m128i *)(v - 3 * p))); - const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)), - _mm_loadl_epi64((__m128i *)(v - 2 * p))); - const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)), - _mm_loadl_epi64((__m128i *)(v - 1 * p))); - const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)), - _mm_loadl_epi64((__m128i *)(v))); - const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)), - _mm_loadl_epi64((__m128i *)(v + 1 * p))); - const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)), - _mm_loadl_epi64((__m128i *)(v + 2 * p))); - const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)), - _mm_loadl_epi64((__m128i *)(v + 3 * p))); - const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)), - _mm_loadl_epi64((__m128i *)(v + 4 * p))); - - _mm_store_si128((__m128i *)(src), p4); - _mm_store_si128((__m128i *)(src + 16), p3); - _mm_store_si128((__m128i *)(src + 32), p2); - _mm_store_si128((__m128i *)(src + 48), p1); - _mm_store_si128((__m128i *)(src + 64), p0); - _mm_store_si128((__m128i *)(src + 80), q0); - _mm_store_si128((__m128i *)(src + 96), q1); - _mm_store_si128((__m128i *)(src + 112), q2); - _mm_store_si128((__m128i *)(src + 128), q3); - _mm_store_si128((__m128i *)(src + 144), q4); - - /* Loop filtering */ - vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit, - _thresh, 1); - - /* Store result */ - _mm_storel_epi64((__m128i *)(u - 3 * p), - _mm_loadl_epi64((__m128i *)(src + 32))); - _mm_storel_epi64((__m128i *)(u - 2 * p), - _mm_loadl_epi64((__m128i *)(src + 48))); - _mm_storel_epi64((__m128i *)(u - p), - _mm_loadl_epi64((__m128i *)(src + 64))); - _mm_storel_epi64((__m128i *)u, - _mm_loadl_epi64((__m128i *)(src + 80))); - _mm_storel_epi64((__m128i *)(u + p), - _mm_loadl_epi64((__m128i *)(src + 96))); - _mm_storel_epi64((__m128i *)(u + 2 * p), - _mm_loadl_epi64((__m128i *)(src + 112))); - - _mm_storel_epi64((__m128i *)(v - 3 * p), - _mm_loadl_epi64((__m128i *)(src + 40))); - _mm_storel_epi64((__m128i *)(v - 2 * p), - _mm_loadl_epi64((__m128i *)(src + 56))); - _mm_storel_epi64((__m128i *)(v - p), - 
_mm_loadl_epi64((__m128i *)(src + 72))); - _mm_storel_epi64((__m128i *)v, - _mm_loadl_epi64((__m128i *)(src + 88))); - _mm_storel_epi64((__m128i *)(v + p), - _mm_loadl_epi64((__m128i *)(src + 104))); - _mm_storel_epi64((__m128i *)(v + 2 * p), - _mm_loadl_epi64((__m128i *)(src + 120))); -} - static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; @@ -941,7 +1346,7 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, /* Loop filtering */ vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); + thresh, 1); src[0] = t_dst + 3 * 16; src[1] = t_dst + 3 * 16 + 8; @@ -953,10 +1358,10 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, } void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); unsigned char *src[4]; unsigned char *dst[4]; @@ -972,7 +1377,7 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, /* Loop filtering */ vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh); + thresh, 1); src[0] = t_dst; src[1] = t_dst + 8 * 16; @@ -982,32 +1387,3 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, transpose(src, 16, dst, p, 2); } - - -void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - unsigned char *v) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); - unsigned char *src[2]; - unsigned char *dst[2]; - - /* Transpose 16x16 */ - transpose8x16(u - 8, v - 8, p, t_dst, 16); - transpose8x16(u, v, p, t_dst + 16 * 8, 16); - - /* Loop filtering */ - vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); - - src[0] = t_dst + 3 * 16; - src[1] = t_dst + 3 * 16 + 8; - - dst[0] = u - 5; - dst[1] = v - 5; - - /* Transpose 16x8 */ - transpose(src, 16, dst, p, 2); -} diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm b/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm deleted file mode 100644 index 74236cf..0000000 --- a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm +++ /dev/null @@ -1,872 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -; Use of pmaxub instead of psubusb to compute filter mask was seen -; in ffvp8 - -%macro LFH_FILTER_AND_HEV_MASK 1 -%if %1 - movdqa xmm2, [rdi+2*rax] ; q3 - movdqa xmm1, [rsi+2*rax] ; q2 - movdqa xmm4, [rsi+rax] ; q1 - movdqa xmm5, [rsi] ; q0 - neg rax ; negate pitch to deal with above border -%else - movlps xmm2, [rsi + rcx*2] ; q3 - movlps xmm1, [rsi + rcx] ; q2 - movlps xmm4, [rsi] ; q1 - movlps xmm5, [rsi + rax] ; q0 - - movhps xmm2, [rdi + rcx*2] - movhps xmm1, [rdi + rcx] - movhps xmm4, [rdi] - movhps xmm5, [rdi + rax] - - lea rsi, [rsi + rax*4] - lea rdi, [rdi + rax*4] - - movdqa XMMWORD PTR [rsp], xmm1 ; store q2 - movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 -%endif - - movdqa xmm6, xmm1 ; q2 - movdqa xmm3, xmm4 ; q1 - - psubusb xmm1, xmm2 ; q2-=q3 - psubusb xmm2, xmm6 ; q3-=q2 - - psubusb xmm4, xmm6 ; q1-=q2 - psubusb xmm6, xmm3 ; q2-=q1 - - por xmm4, xmm6 ; abs(q2-q1) - por xmm1, xmm2 ; abs(q3-q2) - - movdqa xmm0, xmm5 ; q0 - pmaxub xmm1, xmm4 - - psubusb xmm5, xmm3 ; q0-=q1 - psubusb xmm3, xmm0 ; q1-=q0 - - por xmm5, xmm3 ; abs(q0-q1) - movdqa t0, xmm5 ; save to t0 - - pmaxub xmm1, xmm5 - -%if %1 - movdqa xmm2, [rsi+4*rax] ; p3 - movdqa xmm4, [rdi+4*rax] ; p2 - movdqa xmm6, [rsi+2*rax] ; p1 -%else - movlps xmm2, [rsi + rax] ; p3 - movlps xmm4, [rsi] ; p2 - movlps xmm6, [rsi + rcx] ; p1 - - movhps xmm2, [rdi + rax] - movhps xmm4, [rdi] - movhps xmm6, [rdi + rcx] - - movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 - movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 -%endif - - movdqa xmm5, xmm4 ; p2 - movdqa xmm3, xmm6 ; p1 - - psubusb xmm4, xmm2 ; p2-=p3 - psubusb xmm2, xmm5 ; p3-=p2 - - psubusb xmm3, xmm5 ; p1-=p2 - pmaxub xmm1, xmm4 ; abs(p3 - p2) - - psubusb xmm5, xmm6 ; p2-=p1 - pmaxub xmm1, xmm2 ; abs(p3 - p2) - - pmaxub xmm1, xmm5 ; abs(p2 - p1) - movdqa xmm2, xmm6 ; p1 - - pmaxub xmm1, xmm3 ; abs(p2 - p1) -%if %1 - movdqa xmm4, [rsi+rax] ; p0 - movdqa xmm3, [rdi] ; q1 -%else - movlps xmm4, [rsi + rcx*2] ; p0 - movhps xmm4, [rdi + rcx*2] - movdqa xmm3, q1 ; q1 -%endif - - movdqa xmm5, xmm4 ; p0 - psubusb xmm4, xmm6 ; p0-=p1 - - psubusb xmm6, xmm5 ; p1-=p0 - - por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get blimit - - movdqa t1, xmm6 ; save to t1 - - movdqa xmm4, xmm3 ; q1 - pmaxub xmm1, xmm6 - - psubusb xmm3, xmm2 ; q1-=p1 - psubusb xmm2, xmm4 ; p1-=q1 - - psubusb xmm1, xmm7 - por xmm2, xmm3 ; abs(p1-q1) - - movdqa xmm7, XMMWORD PTR [rdx] ; blimit - - movdqa xmm3, xmm0 ; q0 - pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - - mov rdx, arg(4) ; hev get thresh - - movdqa xmm6, xmm5 ; p0 - psrlw xmm2, 1 ; abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; p0-=q0 - - psubusb xmm3, xmm6 ; q0-=p0 - por xmm5, xmm3 ; abs(p0 - q0) - - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - - movdqa xmm4, t0 ; hev get abs (q1 - q0) - - movdqa xmm3, t1 ; get abs (p1 - p0) - - paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - movdqa xmm2, XMMWORD PTR [rdx] ; hev - - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - psubusb xmm4, xmm2 ; hev - - psubusb xmm3, xmm2 ; hev - por xmm1, xmm5 - - pxor xmm7, xmm7 - paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb xmm4, xmm5 ; hev - pcmpeqb xmm3, xmm3 ; hev - - pcmpeqb xmm1, xmm7 ; mask xmm1 - pxor xmm4, xmm3 ; hev -%endmacro - -%macro B_FILTER 1 -%if %1 == 0 - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 -%elif %1 == 1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 -%elif %1 == 2 - lea rdx, srct - - movdqa xmm2, [rdx] ; p1 - movdqa xmm7, [rdx+48] ; 
q1 - movdqa xmm6, [rdx+16] ; p0 - movdqa xmm0, [rdx+32] ; q0 -%endif - - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - - paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 - - paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - punpckhbw xmm5, xmm2 ; axbxcxdx - punpcklbw xmm2, xmm2 ; exfxgxhx - - punpcklbw xmm0, xmm1 ; exfxgxhx - psraw xmm5, 11 ; sign extended shift right by 3 - - punpckhbw xmm1, xmm1 ; axbxcxdx - psraw xmm2, 11 ; sign extended shift right by 3 - - packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - psraw xmm0, 11 ; sign extended shift right by 3 - - psraw xmm1, 11 ; sign extended shift right by 3 - movdqa xmm5, xmm0 ; save results - - packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw xmm5, [GLOBAL(ones)] - - paddsw xmm1, [GLOBAL(ones)] - psraw xmm5, 1 ; partial shifted one more time for 2nd tap - - psraw xmm1, 1 ; partial shifted one more time for 2nd tap - - paddsb xmm6, xmm2 ; p0+= p0 add - packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - -%if %1 == 0 - movdqa xmm1, p1 ; p1 -%elif %1 == 1 - movdqa xmm1, [rsi+2*rax] ; p1 -%elif %1 == 2 - movdqa xmm1, [rdx] ; p1 -%endif - pandn xmm4, xmm5 ; high edge variance additive - pxor xmm6, [GLOBAL(t80)] ; unoffset - - pxor xmm1, [GLOBAL(t80)] ; reoffset - psubsb xmm3, xmm0 ; q0-= q0 add - - paddsb xmm1, xmm4 ; p1+= p1 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - - pxor xmm1, [GLOBAL(t80)] ; unoffset - psubsb xmm7, xmm4 ; q1-= q1 add - - pxor xmm7, [GLOBAL(t80)] ; unoffset -%if %1 == 0 - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] - movq MMWORD PTR [rsi], xmm6 ; p0 - movhps MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rax], xmm1 ; p1 - movhps MMWORD PTR [rdi + rax], xmm1 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - movhps MMWORD PTR [rdi + rcx], xmm3 - movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 - movhps MMWORD PTR [rdi + rcx*2],xmm7 -%elif %1 == 1 - movdqa [rsi+rax], xmm6 ; write back - movdqa [rsi+2*rax], xmm1 ; write back - movdqa [rsi], xmm3 ; write back - movdqa [rdi], xmm7 ; write back -%endif - -%endmacro - - -;void vp9_loop_filter_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE -sym(vp9_loop_filter_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 1 - ; filter and 
write back the result - B_FILTER 1 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_horizontal_edge_uv_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE -sym(vp9_loop_filter_horizontal_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; - %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; - %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; - %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; - %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ; u - mov rdi, arg(5) ; v - movsxd rax, dword ptr arg(1) ; src_pixel_step - mov rcx, rax - neg rax ; negate pitch to deal with above border - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 0 - ; filter and write back the result - B_FILTER 0 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -%macro TRANSPOSE_16X8 2 - movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 - movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 - movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 - movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 - movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 - - punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - - movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 - - movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 - - movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 - - punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 -%if %1 - lea rsi, [rsi+rax*8] -%else - mov rsi, arg(5) ; v_ptr -%endif - - movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 - punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 - - punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 -%if %1 - lea rdi, [rdi+rax*8] -%else - lea rsi, [rsi - 4] -%endif - - punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 -%if %1 - lea rdx, srct -%else - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing -%endif - - movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - - movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 
44 34 24 14 04 - - punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - - movdqa t0, xmm2 ; save to free XMM2 - movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 - movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 - movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 - movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 - - punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 - - punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 - - movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 - - punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 - - movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 - - punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 - - movdqa xmm6, xmm1 ; - punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 - - punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - - punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - movdqa xmm0, xmm5 - punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - - punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 - - punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 - movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - - punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 -%if %2 - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - movdqa [rdx], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+16], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+32], xmm4 ; save 4 - movdqa [rdx+48], xmm5 ; save 5 - movdqa xmm1, t0 ; get - - movdqa xmm2, xmm1 ; - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 -%else - movdqa [rdx+112], xmm7 ; save 7 - - movdqa [rdx+96], xmm6 ; save 6 - - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - movdqa [rdx+32], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+48], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+64], xmm4 ; save 4 - movdqa [rdx+80], xmm5 ; save 5 - 
movdqa xmm1, t0 ; get - - movdqa xmm2, xmm1 - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - - movdqa [rdx+16], xmm1 - - movdqa [rdx], xmm2 -%endif -%endmacro - -%macro LFV_FILTER_MASK_HEV_MASK 1 - movdqa xmm0, xmm6 ; q2 - psubusb xmm0, xmm7 ; q2-q3 - - psubusb xmm7, xmm6 ; q3-q2 - movdqa xmm4, xmm5 ; q1 - - por xmm7, xmm0 ; abs (q3-q2) - psubusb xmm4, xmm6 ; q1-q2 - - movdqa xmm0, xmm1 - psubusb xmm6, xmm5 ; q2-q1 - - por xmm6, xmm4 ; abs (q2-q1) - psubusb xmm0, xmm2 ; p2 - p3; - - psubusb xmm2, xmm1 ; p3 - p2; - por xmm0, xmm2 ; abs(p2-p3) -%if %1 - movdqa xmm2, [rdx] ; p1 -%else - movdqa xmm2, [rdx+32] ; p1 -%endif - movdqa xmm5, xmm2 ; p1 - pmaxub xmm0, xmm7 - - psubusb xmm5, xmm1 ; p1-p2 - psubusb xmm1, xmm2 ; p2-p1 - - movdqa xmm7, xmm3 ; p0 - psubusb xmm7, xmm2 ; p0-p1 - - por xmm1, xmm5 ; abs(p2-p1) - pmaxub xmm0, xmm6 - - pmaxub xmm0, xmm1 - movdqa xmm1, xmm2 ; p1 - - psubusb xmm2, xmm3 ; p1-p0 - lea rdx, srct - - por xmm2, xmm7 ; abs(p1-p0) - - movdqa t0, xmm2 ; save abs(p1-p0) - - pmaxub xmm0, xmm2 - -%if %1 - movdqa xmm5, [rdx+32] ; q0 - movdqa xmm7, [rdx+48] ; q1 -%else - movdqa xmm5, [rdx+64] ; q0 - movdqa xmm7, [rdx+80] ; q1 -%endif - mov rdx, arg(3) ; limit - - movdqa xmm6, xmm5 ; q0 - movdqa xmm2, xmm7 ; q1 - - psubusb xmm5, xmm7 ; q0-q1 - psubusb xmm7, xmm6 ; q1-q0 - - por xmm7, xmm5 ; abs(q1-q0) - - movdqa t1, xmm7 ; save abs(q1-q0) - - movdqa xmm4, XMMWORD PTR [rdx]; limit - - pmaxub xmm0, xmm7 - mov rdx, arg(2) ; blimit - - psubusb xmm0, xmm4 - movdqa xmm5, xmm2 ; q1 - - psubusb xmm5, xmm1 ; q1-=p1 - psubusb xmm1, xmm2 ; p1-=q1 - - por xmm5, xmm1 ; abs(p1-q1) - movdqa xmm1, xmm3 ; p0 - - pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psubusb xmm1, xmm6 ; p0-q0 - - psrlw xmm5, 1 ; abs(p1-q1)/2 - psubusb xmm6, xmm3 ; q0-p0 - - movdqa xmm4, XMMWORD PTR [rdx]; blimit - - mov rdx, arg(4) ; get thresh - - por xmm1, xmm6 ; abs(q0-p0) - - movdqa xmm6, t0 ; get abs (q1 - q0) - - paddusb xmm1, xmm1 ; abs(q0-p0)*2 - - movdqa xmm3, t1 ; get abs (p1 - p0) - - movdqa xmm7, XMMWORD PTR [rdx] - - paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - - psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - por xmm1, xmm0 ; mask - pcmpeqb xmm6, xmm0 - - pxor xmm0, xmm0 - pcmpeqb xmm4, xmm4 - - pcmpeqb xmm1, xmm0 - pxor xmm4, xmm6 -%endmacro - -%macro BV_TRANSPOSE 0 - ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - - movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - - punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - - movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - - punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 
82 - - punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - - punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 - ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 -%endmacro - -%macro BV_WRITEBACK 2 - movd [rsi+2], %1 - psrldq %1, 4 - - movd [rdi+2], %1 - psrldq %1, 4 - - movd [rsi+2*rax+2], %1 - psrldq %1, 4 - - movd [rdi+2*rax+2], %1 - - movd [rsi+4*rax+2], %2 - psrldq %2, 4 - - movd [rdi+4*rax+2], %2 - psrldq %2, 4 - - movd [rsi+2*rcx+2], %2 - psrldq %2, 4 - - movd [rdi+2*rcx+2], %2 -%endmacro - - -;void vp9_loop_filter_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE -sym(vp9_loop_filter_vertical_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ; src_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax*2+rax] - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. - TRANSPOSE_16X8 1, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 - - ; start work on filters - B_FILTER 2 - - ; tranpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - ; store 16-line result - - lea rdx, [rax] - neg rdx - - BV_WRITEBACK xmm1, xmm5 - - lea rsi, [rsi+rdx*8] - lea rdi, [rdi+rdx*8] - BV_WRITEBACK xmm2, xmm6 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_vertical_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE -sym(vp9_loop_filter_vertical_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ; u_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax+2*rax] - - lea rdx, srct - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
- TRANSPOSE_16X8 0, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 - - ; start work on filters - B_FILTER 2 - - ; tranpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ; store 16-line result - BV_WRITEBACK xmm1, xmm5 - - mov rsi, arg(0) ; u_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - BV_WRITEBACK xmm2, xmm6 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -tfe: - times 16 db 0xfe -align 16 -t80: - times 16 db 0x80 -align 16 -t1s: - times 16 db 0x01 -align 16 -t3: - times 16 db 0x03 -align 16 -t4: - times 16 db 0x04 -align 16 -ones: - times 8 dw 0x0001 -align 16 -s9: - times 8 dw 0x0900 -align 16 -s63: - times 8 dw 0x003f diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h b/libvpx/vp9/common/x86/vp9_loopfilter_x86.h deleted file mode 100644 index fb5af05..0000000 --- a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_ -#define VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_ - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); -#endif - -#if HAVE_SSE2 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2); -#endif - -#endif // LOOPFILTER_X86_H diff --git a/libvpx/vp9/common/x86/vp9_mask_sse3.asm b/libvpx/vp9/common/x86/vp9_mask_sse3.asm deleted file mode 100644 index fe46823..0000000 --- a/libvpx/vp9/common/x86/vp9_mask_sse3.asm +++ /dev/null @@ -1,484 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void int vp8_makemask_sse3( -; unsigned char *y, -; unsigned char *u, -; unsigned char *v, -; unsigned char *ym, -; unsigned char *uvm, -; int yp, -; int uvp, -; int ys, -; int us, -; int vs, -; int yt, -; int ut, -; int vt) -global sym(vp8_makemask_sse3) PRIVATE -sym(vp8_makemask_sse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 14 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;y - mov rdi, arg(1) ;u - mov rcx, arg(2) ;v - mov rax, arg(3) ;ym - movsxd rbx, dword arg(4) ;yp - movsxd rdx, dword arg(5) ;uvp - - pxor xmm0,xmm0 - - ;make 16 copies of the center y value - movd xmm1, arg(6) - pshufb xmm1, xmm0 - - ; make 16 copies of the center u value - movd xmm2, arg(7) - pshufb xmm2, xmm0 - - ; make 16 copies of the center v value - movd xmm3, arg(8) - pshufb xmm3, xmm0 - unpcklpd xmm2, xmm3 - - ;make 16 copies of the y tolerance - movd xmm3, arg(9) - pshufb xmm3, xmm0 - - ;make 16 copies of the u tolerance - movd xmm4, arg(10) - pshufb xmm4, xmm0 - - ;make 16 copies of the v tolerance - movd xmm5, arg(11) - pshufb xmm5, xmm0 - unpckhpd xmm4, xmm5 - - mov r8,8 - -NextPairOfRows: - - ;grab the y source values - movdqu xmm0, [rsi] - - ;compute abs difference between source and y target - movdqa xmm6, xmm1 - movdqa xmm7, xmm0 - psubusb xmm0, xmm1 - psubusb xmm6, xmm7 - por xmm0, xmm6 - - ;compute abs difference between - movdqa xmm6, xmm3 - pcmpgtb xmm6, xmm0 - - ;grab the y source values - add rsi, rbx - movdqu xmm0, [rsi] - - ;compute abs difference between source and y target - movdqa xmm11, xmm1 - movdqa xmm7, xmm0 - psubusb xmm0, xmm1 - psubusb xmm11, xmm7 - por xmm0, xmm11 - - ;compute abs difference between - movdqa xmm11, xmm3 - pcmpgtb xmm11, xmm0 - - - ;grab the u and v source values - movdqu xmm7, [rdi] - movdqu xmm8, [rcx] - unpcklpd xmm7, xmm8 - - ;compute abs difference between source and uv targets - movdqa xmm9, xmm2 - movdqa xmm10, xmm7 - psubusb xmm7, xmm2 - psubusb xmm9, xmm10 - por xmm7, xmm9 - - ;check whether the number is < tolerance - movdqa xmm0, xmm4 - pcmpgtb xmm0, xmm7 - - ;double u and v masks - movdqa xmm8, xmm0 - punpckhbw xmm0, xmm0 - punpcklbw xmm8, xmm8 - - ;mask row 0 and output - pand xmm6, xmm8 - pand xmm6, xmm0 - movdqa [rax],xmm6 - - ;mask row 1 and output - pand xmm11, xmm8 - pand xmm11, xmm0 - movdqa [rax+16],xmm11 - - - ; to the next row or set of rows - add rsi, rbx - add rdi, rdx - add rcx, rdx - add rax,32 - dec r8 - jnz NextPairOfRows - - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;GROW_HORIZ (register for result, source register or mem local) -; takes source and shifts left and ors with source -; then shifts right and ors with source -%macro GROW_HORIZ 2 - movdqa %1, %2 - movdqa xmm14, %1 - movdqa xmm15, %1 - pslldq xmm14, 1 - psrldq xmm15, 1 - por %1,xmm14 - por %1,xmm15 -%endmacro -;GROW_VERT (result, center row, above row, below row) -%macro GROW_VERT 4 - movdqa %1,%2 - por %1,%3 - por %1,%4 -%endmacro - -;GROW_NEXTLINE (new line to grow, new source, line to write) -%macro GROW_NEXTLINE 3 - GROW_HORIZ %1, %2 - GROW_VERT xmm3, xmm0, xmm1, xmm2 - movdqa %3,xmm3 -%endmacro - - -;void int vp8_growmaskmb_sse3( -; unsigned char *om, -; unsigned char *nm, -global sym(vp8_growmaskmb_sse3) PRIVATE -sym(vp8_growmaskmb_sse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src - mov rdi, arg(1) ;rst - - GROW_HORIZ xmm0, [rsi] - GROW_HORIZ xmm1, [rsi+16] - GROW_HORIZ xmm2, [rsi+32] - - GROW_VERT xmm3, 
xmm0, xmm1, xmm2 - por xmm0,xmm1 - movdqa [rdi], xmm0 - movdqa [rdi+16],xmm3 - - GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] - GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] - GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] - GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] - GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] - GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] - GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] - GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] - GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] - GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] - GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] - GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] - GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] - - por xmm0,xmm2 - movdqa [rdi+240], xmm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int vp8_sad16x16_masked_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; unsigned char *mask) -global sym(vp8_sad16x16_masked_wmt) PRIVATE -sym(vp8_sad16x16_masked_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rbx, arg(4) ;mask - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -NextSadRow: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - pand xmm0, xmm2 - pand xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm3, xmm0 - - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz NextSadRow - - movdqa xmm4 , xmm3 - psrldq xmm4, 8 - paddw xmm3, xmm4 - movq rax, xmm3 - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_sad16x16_unmasked_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; unsigned char *mask) -global sym(vp8_sad16x16_unmasked_wmt) PRIVATE -sym(vp8_sad16x16_unmasked_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rbx, arg(4) ;mask - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -next_vp8_sad16x16_unmasked_wmt: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - por xmm0, xmm2 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm3, xmm0 - - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz next_vp8_sad16x16_unmasked_wmt - - movdqa xmm4 , xmm3 - psrldq xmm4, 8 - paddw xmm3, xmm4 - movq rax, xmm3 - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_masked_predictor_wmt( -; unsigned char *masked, -; unsigned char *unmasked, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; unsigned char *mask) -global sym(vp8_masked_predictor_wmt) PRIVATE -sym(vp8_masked_predictor_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;ref_ptr - - mov rbx, arg(5) ;mask - movsxd rax, dword ptr arg(2) ;src_stride - mov r11, arg(3) ; destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -next_vp8_masked_predictor_wmt: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - movdqu [r11], xmm0 - - add r11, rdx - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz next_vp8_masked_predictor_wmt - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp8_masked_predictor_uv_wmt( 
-; unsigned char *masked, -; unsigned char *unmasked, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; unsigned char *mask) -global sym(vp8_masked_predictor_uv_wmt) PRIVATE -sym(vp8_masked_predictor_uv_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;ref_ptr - - mov rbx, arg(5) ;mask - movsxd rax, dword ptr arg(2) ;src_stride - mov r11, arg(3) ; destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - mov rcx, 8 - - pxor xmm3, xmm3 - -next_vp8_masked_predictor_uv_wmt: - movq xmm0, [rsi] - movq xmm1, [rdi] - movq xmm2, [rbx] - - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - movq [r11], xmm0 - - add r11, rdx - add rsi, rax - add rdi, rax - add rbx, 8 - - dec rcx - jnz next_vp8_masked_predictor_uv_wmt - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_uv_from_y_mask( -; unsigned char *ymask, -; unsigned char *uvmask) -global sym(vp8_uv_from_y_mask) PRIVATE -sym(vp8_uv_from_y_mask): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - - mov rcx, 8 - - pxor xmm3, xmm3 - -next_p8_uv_from_y_mask: - movdqu xmm0, [rsi] - pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)] - movq [rdi],xmm0 - add rdi, 8 - add rsi,32 - - dec rcx - jnz next_p8_uv_from_y_mask - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 - diff --git a/libvpx/vp9/common/x86/vp9_recon_mmx.asm b/libvpx/vp9/common/x86/vp9_recon_mmx.asm deleted file mode 100644 index 6fbbe48..0000000 --- a/libvpx/vp9/common/x86/vp9_recon_mmx.asm +++ /dev/null @@ -1,272 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" -;void copy_mem8x8_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem8x8_mmx) PRIVATE -sym(vp9_copy_mem8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - add rsi, rax - - movq [rdi+rcx], mm1 - movq [rdi+rcx*2], mm2 - - - lea rdi, [rdi+rcx*2] - movq mm3, [rsi] - - add rdi, rcx - movq mm4, [rsi+rax] - - movq mm5, [rsi+rax*2] - movq [rdi], mm3 - - lea rsi, [rsi+rax*2] - movq [rdi+rcx], mm4 - - movq [rdi+rcx*2], mm5 - lea rdi, [rdi+rcx*2] - - movq mm0, [rsi+rax] - movq mm1, [rsi+rax*2] - - movq [rdi+rcx], mm0 - movq [rdi+rcx*2],mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem8x4_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem8x4_mmx) PRIVATE -sym(vp9_copy_mem8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movq mm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movq mm1, [rsi+rax] - movq mm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movq [rdi], mm0 - movq [rdi+rcx], mm1 - - movq [rdi+rcx*2], mm2 - lea rdi, [rdi+rcx*2] - - movq mm3, [rsi+rax] - movq [rdi+rcx], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void copy_mem16x16_mmx( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem16x16_mmx) PRIVATE -sym(vp9_copy_mem16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movsxd rax, dword ptr arg(1) ;src_stride; - - mov rdi, arg(2) ;dst; - movsxd rcx, dword ptr arg(3) ;dst_stride - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - 
lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq mm1, [rsi+rax] - movq mm4, [rsi+rax+8] - - movq mm2, [rsi+rax*2] - movq mm5, [rsi+rax*2+8] - - lea rsi, [rsi+rax*2] - add rsi, rax - - movq [rdi], mm0 - movq [rdi+8], mm3 - - movq [rdi+rcx], mm1 - movq [rdi+rcx+8], mm4 - - movq [rdi+rcx*2], mm2 - movq [rdi+rcx*2+8], mm5 - - lea rdi, [rdi+rcx*2] - add rdi, rcx - - movq mm0, [rsi] - movq mm3, [rsi+8]; - - movq [rdi], mm0 - movq [rdi+8], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/common/x86/vp9_recon_sse2.asm b/libvpx/vp9/common/x86/vp9_recon_sse2.asm deleted file mode 100644 index 9ee3043..0000000 --- a/libvpx/vp9/common/x86/vp9_recon_sse2.asm +++ /dev/null @@ -1,572 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -;void copy_mem16x16_sse2( -; unsigned char *src, -; int src_stride, -; unsigned char *dst, -; int dst_stride -; ) -global sym(vp9_copy_mem16x16_sse2) PRIVATE -sym(vp9_copy_mem16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src; - movdqu xmm0, [rsi] - - movsxd rax, dword ptr arg(1) ;src_stride; - mov rdi, arg(2) ;dst; - - movdqu xmm1, [rsi+rax] - movdqu xmm2, [rsi+rax*2] - - movsxd rcx, dword ptr arg(3) ;dst_stride - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - lea rdi, [rdi+rcx*2] - movdqu xmm3, [rsi] - - add rdi, rcx - movdqu xmm4, [rsi+rax] - - movdqu xmm5, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm3 - add rsi, rax - - movdqa [rdi+rcx], xmm4 - movdqa [rdi+rcx*2],xmm5 - - lea rdi, [rdi+rcx*2] - movdqu xmm0, [rsi] - - add rdi, rcx - movdqu xmm1, [rsi+rax] - - movdqu xmm2, [rsi+rax*2] - lea rsi, [rsi+rax*2] - - movdqa [rdi], xmm0 - add rsi, rax - - movdqa [rdi+rcx], xmm1 - - movdqa [rdi+rcx*2], xmm2 - movdqu xmm3, [rsi] - - movdqu xmm4, [rsi+rax] - lea rdi, [rdi+rcx*2] - - add rdi, rcx - movdqu xmm5, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm3 - - add rsi, rax - movdqa [rdi+rcx], xmm4 - - movdqa [rdi+rcx*2],xmm5 - movdqu xmm0, [rsi] - - lea rdi, [rdi+rcx*2] - movdqu xmm1, [rsi+rax] - - add rdi, rcx - movdqu xmm2, [rsi+rax*2] - - lea rsi, [rsi+rax*2] - movdqa [rdi], xmm0 - - movdqa [rdi+rcx], xmm1 - movdqa [rdi+rcx*2],xmm2 - - movdqu xmm3, [rsi+rax] - lea rdi, [rdi+rcx*2] - - movdqa [rdi+rcx], xmm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_intra_pred_uv_dc_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dc_mmx2) PRIVATE -sym(vp9_intra_pred_uv_dc_mmx2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax - pxor mm0, mm0 - movq mm1, [rsi] - psadbw mm1, mm0 - - ; from left - dec rsi - lea rdi, [rax*3] - movzx ecx, byte [rsi+rax] - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx 
- movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - movzx edx, byte [rsi+rax*4] - add ecx, edx - - ; add up - pextrw edx, mm1, 0x0 - lea edx, [edx+ecx+8] - sar edx, 4 - movd mm1, edx - pshufw mm1, mm1, 0x0 - packuswb mm1, mm1 - - ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_dctop_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dctop_mmx2) PRIVATE -sym(vp9_intra_pred_uv_dctop_mmx2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax - pxor mm0, mm0 - movq mm1, [rsi] - psadbw mm1, mm0 - - ; add up - paddw mm1, [GLOBAL(dc_4)] - psraw mm1, 3 - pshufw mm1, mm1, 0x0 - packuswb mm1, mm1 - - ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_dcleft_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dcleft_mmx2) PRIVATE -sym(vp9_intra_pred_uv_dcleft_mmx2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - ; from left - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - dec rsi - lea rdi, [rax*3] - movzx ecx, byte [rsi] - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - add ecx, edx - lea rsi, [rsi+rax*4] - movzx edx, byte [rsi] - add ecx, edx - movzx edx, byte [rsi+rax] - add ecx, edx - movzx edx, byte [rsi+rax*2] - add ecx, edx - movzx edx, byte [rsi+rdi] - lea edx, [ecx+edx+4] - - ; add up - shr edx, 3 - movd mm1, edx - pshufw mm1, mm1, 0x0 - packuswb mm1, mm1 - - ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - lea rax, [rcx*3] - - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_dc128_mmx( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_dc128_mmx) PRIVATE -sym(vp9_intra_pred_uv_dc128_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - ; end prolog - - ; write out - movq mm1, [GLOBAL(dc_128)] - mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride - lea rcx, [rdx*3] - - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - lea rax, [rax+rdx*4] - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp 
- ret - -;void vp9_intra_pred_uv_tm_sse2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -%macro vp9_intra_pred_uv_tm 1 -global sym(vp9_intra_pred_uv_tm_%1) PRIVATE -sym(vp9_intra_pred_uv_tm_%1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; read top row - mov edx, 4 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax - pxor xmm0, xmm0 -%ifidn %1, ssse3 - movdqa xmm2, [GLOBAL(dc_1024)] -%endif - movq xmm1, [rsi] - punpcklbw xmm1, xmm0 - - ; set up left ptrs ans subtract topleft - movd xmm3, [rsi-1] - lea rsi, [rsi+rax-1] -%ifidn %1, sse2 - punpcklbw xmm3, xmm0 - pshuflw xmm3, xmm3, 0x0 - punpcklqdq xmm3, xmm3 -%else - pshufb xmm3, xmm2 -%endif - psubw xmm1, xmm3 - - ; set up dest ptrs - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride - -.vp9_intra_pred_uv_tm_%1_loop: - movd xmm3, [rsi] - movd xmm5, [rsi+rax] -%ifidn %1, sse2 - punpcklbw xmm3, xmm0 - punpcklbw xmm5, xmm0 - pshuflw xmm3, xmm3, 0x0 - pshuflw xmm5, xmm5, 0x0 - punpcklqdq xmm3, xmm3 - punpcklqdq xmm5, xmm5 -%else - pshufb xmm3, xmm2 - pshufb xmm5, xmm2 -%endif - paddw xmm3, xmm1 - paddw xmm5, xmm1 - packuswb xmm3, xmm5 - movq [rdi ], xmm3 - movhps[rdi+rcx], xmm3 - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rcx*2] - dec edx - jnz .vp9_intra_pred_uv_tm_%1_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%endmacro - -vp9_intra_pred_uv_tm sse2 -vp9_intra_pred_uv_tm ssse3 - -;void vp9_intra_pred_uv_ve_mmx( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -global sym(vp9_intra_pred_uv_ve_mmx) PRIVATE -sym(vp9_intra_pred_uv_ve_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - ; end prolog - - ; read from top - mov rax, arg(2) ;src; - movsxd rdx, dword ptr arg(3) ;src_stride; - sub rax, rdx - movq mm1, [rax] - - ; write out - mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride - lea rcx, [rdx*3] - - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - lea rax, [rax+rdx*4] - movq [rax ], mm1 - movq [rax+rdx ], mm1 - movq [rax+rdx*2], mm1 - movq [rax+rcx ], mm1 - - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_intra_pred_uv_ho_mmx2( -; unsigned char *dst, -; int dst_stride -; unsigned char *src, -; int src_stride, -; ) -%macro vp9_intra_pred_uv_ho 1 -global sym(vp9_intra_pred_uv_ho_%1) PRIVATE -sym(vp9_intra_pred_uv_ho_%1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi -%ifidn %1, ssse3 -%ifndef GET_GOT_SAVE_ARG - push rbx -%endif - GET_GOT rbx -%endif - ; end prolog - - ; read from left and write out -%ifidn %1, mmx2 - mov edx, 4 -%endif - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride -%ifidn %1, ssse3 - lea rdx, [rcx*3] - movdqa xmm2, [GLOBAL(dc_00001111)] - lea rbx, [rax*3] -%endif - dec rsi -%ifidn %1, mmx2 -.vp9_intra_pred_uv_ho_%1_loop: - movd mm0, [rsi] - movd mm1, [rsi+rax] - punpcklbw mm0, mm0 - punpcklbw mm1, mm1 - pshufw mm0, mm0, 0x0 - pshufw mm1, mm1, 0x0 - movq [rdi ], mm0 - movq [rdi+rcx], mm1 - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rcx*2] - dec edx - jnz .vp9_intra_pred_uv_ho_%1_loop -%else - movd xmm0, [rsi] - movd xmm3, [rsi+rax] - movd xmm1, [rsi+rax*2] - movd xmm4, [rsi+rbx] - punpcklbw xmm0, xmm3 - punpcklbw xmm1, xmm4 - pshufb xmm0, xmm2 - pshufb xmm1, xmm2 - movq [rdi ], xmm0 - movhps [rdi+rcx], xmm0 - movq 
[rdi+rcx*2], xmm1 - movhps [rdi+rdx], xmm1 - lea rsi, [rsi+rax*4] - lea rdi, [rdi+rcx*4] - movd xmm0, [rsi] - movd xmm3, [rsi+rax] - movd xmm1, [rsi+rax*2] - movd xmm4, [rsi+rbx] - punpcklbw xmm0, xmm3 - punpcklbw xmm1, xmm4 - pshufb xmm0, xmm2 - pshufb xmm1, xmm2 - movq [rdi ], xmm0 - movhps [rdi+rcx], xmm0 - movq [rdi+rcx*2], xmm1 - movhps [rdi+rdx], xmm1 -%endif - - ; begin epilog -%ifidn %1, ssse3 - RESTORE_GOT -%ifndef GET_GOT_SAVE_ARG - pop rbx -%endif -%endif - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret -%endmacro - -vp9_intra_pred_uv_ho mmx2 -vp9_intra_pred_uv_ho ssse3 - -SECTION_RODATA -dc_128: - times 8 db 128 -dc_4: - times 4 dw 4 -align 16 -dc_1024: - times 8 dw 0x400 -align 16 -dc_00001111: - times 8 db 0 - times 8 db 1 diff --git a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c b/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c deleted file mode 100644 index 97148fb..0000000 --- a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_blockd.h" - -#define build_intra_predictors_mbuv_prototype(sym) \ - void sym(unsigned char *dst, int dst_stride, \ - const unsigned char *src, int src_stride) -typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t)); - -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2); -extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3); - -static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_stride, - build_intra_pred_mbuv_fn_t tm_fn, - build_intra_pred_mbuv_fn_t ho_fn) { - int mode = xd->mode_info_context->mbmi.uv_mode; - build_intra_pred_mbuv_fn_t fn; - int src_stride = xd->plane[1].dst.stride; - - switch (mode) { - case V_PRED: - fn = vp9_intra_pred_uv_ve_mmx; - break; - case H_PRED: - fn = ho_fn; - break; - case TM_PRED: - fn = tm_fn; - break; - case DC_PRED: - if (xd->up_available) { - if (xd->left_available) { - fn = vp9_intra_pred_uv_dc_mmx2; - break; - } else { - fn = vp9_intra_pred_uv_dctop_mmx2; - break; - } - } else if (xd->left_available) { - fn = vp9_intra_pred_uv_dcleft_mmx2; - break; - } else { - fn = vp9_intra_pred_uv_dc128_mmx; - break; - } - break; - default: - return; - } - - fn(dst_u, dst_stride, xd->plane[1].dst.buf, src_stride); - fn(dst_v, dst_stride, xd->plane[2].dst.buf, src_stride); -} - -void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - 
vp9_intra_pred_uv_tm_sse2, - vp9_intra_pred_uv_ho_mmx2); -} - -void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - vp9_intra_pred_uv_tm_ssse3, - vp9_intra_pred_uv_ho_ssse3); -} - -void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - vp9_intra_pred_uv_tm_sse2, - vp9_intra_pred_uv_ho_mmx2); -} - -void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf, - xd->plane[2].dst.buf, xd->plane[1].dst.stride, - vp9_intra_pred_uv_tm_ssse3, - vp9_intra_pred_uv_ho_ssse3); -} diff --git a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c b/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c deleted file mode 100644 index ed873a5..0000000 --- a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <emmintrin.h> /* SSE2 */ -#include "vpx/vpx_integer.h" -#include "vpx_ports/emmintrin_compat.h" - -unsigned int vp9_sad16x3_sse2( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - __m128i s0, s1, s2; - __m128i r0, r1, r2; - __m128i sad; - - s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride)); - s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride)); - s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride)); - - r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride)); - r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride)); - r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride)); - - sad = _mm_sad_epu8(s0, r0); - sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1)); - sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2)); - sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); - - return _mm_cvtsi128_si32(sad); -} - -unsigned int vp9_sad3x16_sse2( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - int r; - __m128i s0, s1, s2, s3; - __m128i r0, r1, r2, r3; - __m128i sad = _mm_setzero_si128(); - __m128i mask; - const int offset = (uintptr_t)src_ptr & 3; - - /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off. - * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd - * takes much less time. 
- */ - if (offset == 1) - src_ptr -= 1; - - /* mask = 0xffffffffffff0000ffffffffffff0000 */ - mask = _mm_cmpeq_epi32(sad, sad); - mask = _mm_slli_epi64(mask, 16); - - for (r = 0; r < 16; r += 4) { - s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride)); - s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride)); - s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride)); - s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride)); - r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride)); - r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride)); - r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride)); - r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride)); - - s0 = _mm_unpacklo_epi8(s0, s1); - r0 = _mm_unpacklo_epi8(r0, r1); - s2 = _mm_unpacklo_epi8(s2, s3); - r2 = _mm_unpacklo_epi8(r2, r3); - s0 = _mm_unpacklo_epi64(s0, s2); - r0 = _mm_unpacklo_epi64(r0, r2); - - // throw out extra byte - if (offset == 1) - s0 = _mm_and_si128(s0, mask); - else - s0 = _mm_slli_epi64(s0, 16); - r0 = _mm_slli_epi64(r0, 16); - - sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0)); - - src_ptr += src_stride*4; - ref_ptr += ref_stride*4; - } - - sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); - return _mm_cvtsi128_si32(sad); -} diff --git a/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm new file mode 100644 index 0000000..174e747 --- /dev/null +++ b/libvpx/vp9/decoder/arm/neon/vp9_add_constant_residual_neon.asm @@ -0,0 +1,230 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
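
The NEON kernels added in this file vectorize the scalar add-constant-residual routine that the comments further down quote in C. For orientation, a self-contained sketch of that scalar behavior, assuming a local clip_pixel helper that saturates to the 8-bit range:

    #include <stdint.h>

    static uint8_t clip_pixel(int val) {
      return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));  /* saturate to [0, 255] */
    }

    /* Add the same constant residual to every pixel of a width x height block. */
    static void add_constant_residual(int16_t diff, uint8_t *dest, int stride,
                                      int width, int height) {
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
          dest[c] = clip_pixel(diff + dest[c]);
        dest += stride;
      }
    }

The 8x8, 16x16 and 32x32 NEON entry points each specialize this loop for a fixed block size, branching into a saturating add (vqadd.u8) or subtract (vqsub.u8) path depending on the sign of diff.
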
+; + + EXPORT |vp9_add_constant_residual_8x8_neon| + EXPORT |vp9_add_constant_residual_16x16_neon| + EXPORT |vp9_add_constant_residual_32x32_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10}, [$dst], $stride + vst1.8 {q11}, [$dst], $stride + vst1.8 {q12}, [$dst], $stride + vst1.8 {q13}, [$dst], $stride + vst1.8 {q14}, [$dst], $stride + vst1.8 {q15}, [$dst], $stride + MEND + +; void add_constant_residual(const int16_t diff, uint8_t *dest, int stride, +; int width, int height) { +; int r, c; +; +; for (r = 0; r < height; r++) { +; for (c = 0; c < width; c++) +; dest[c] = clip_pixel(diff + dest[c]); +; +; dest += stride; +; } +;} +;void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, +; int stride) { +; add_constant_residual(diff, dest, stride, 8, 8); +;} +; r0 : const int16_t diff +; r1 : const uint8_t *dest +; r2 : int stride +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp9_add_constant_residual_8x8_neon| PROC + mov r3, r1 ; r3: save dest to r3 + vld1.8 {d0}, [r1], r2 + vld1.8 {d1}, [r1], r2 + vld1.8 {d2}, [r1], r2 + vld1.8 {d3}, [r1], r2 + vld1.8 {d4}, [r1], r2 + vld1.8 {d5}, [r1], r2 + vld1.8 {d6}, [r1], r2 + vld1.8 {d7}, [r1], r2 + cmp r0, #0 + bge DIFF_POSITIVE_8x8 + +DIFF_NEGATIVE_8x8 ; diff < 0 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q8, r0 + + vqsub.u8 q0, q0, q8 + vqsub.u8 q1, q1, q8 + vqsub.u8 q2, q2, q8 + vqsub.u8 q3, q3, q8 + b DIFF_SAVE_8x8 + +DIFF_POSITIVE_8x8 ; diff >= 0 + usat r0, #8, r0 + vdup.u8 q8, r0 + + vqadd.u8 q0, q0, q8 + vqadd.u8 q1, q1, q8 + vqadd.u8 q2, q2, q8 + vqadd.u8 q3, q3, q8 + +DIFF_SAVE_8x8 + vst1.8 {d0}, [r3], r2 + vst1.8 {d1}, [r3], r2 + vst1.8 {d2}, [r3], r2 + vst1.8 {d3}, [r3], r2 + vst1.8 {d4}, [r3], r2 + vst1.8 {d5}, [r3], r2 + vst1.8 {d6}, [r3], r2 + vst1.8 {d7}, [r3], r2 + + bx lr + ENDP + +;void vp9_add_constant_residual_16x16_c(const int16_t diff, uint8_t *dest, +; int stride) { +; add_constant_residual(diff, dest, stride, 16, 16); +;} +; r0 : const int16_t diff +; r1 : const uint8_t *dest +; r2 : int stride +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp9_add_constant_residual_16x16_neon| PROC + mov r3, r1 + LD_16x8 r1, r2 + cmp r0, #0 + bge DIFF_POSITIVE_16x16 + +|DIFF_NEGATIVE_16x16| + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + + SUB_DIFF_16x8 q0 + ST_16x8 r3, r2 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + b DIFF_SAVE_16x16 + +|DIFF_POSITIVE_16x16| + usat r0, #8, r0 + vdup.u8 q0, r0 + + ADD_DIFF_16x8 q0 + ST_16x8 r3, r2 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + +|DIFF_SAVE_16x16| + ST_16x8 r3, r2 + bx lr + ENDP + +;void vp9_add_constant_residual_32x32_c(const int16_t diff, 
uint8_t *dest, +; int stride) { +; add_constant_residual(diff, dest, stride, 32, 32); +;} +; r0 : const int16_t diff +; r1 : const uint8_t *dest +; r2 : int stride +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp9_add_constant_residual_32x32_neon| PROC + push {r4,lr} + pld [r1] + mov r3, r1 + add r4, r1, #16 ; r4 dest + 16 for second loop + cmp r0, #0 + bge DIFF_POSITIVE_32x32 + +|DIFF_NEGATIVE_32x32| + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +|DIFF_NEGATIVE_32x32_LOOP| + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r3, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r3, r2 + cmp r0, #2 + moveq r1, r4 + moveq r3, r4 + cmp r0, #0 + bne DIFF_NEGATIVE_32x32_LOOP + pop {r4,pc} + +|DIFF_POSITIVE_32x32| + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +|DIFF_POSITIVE_32x32_LOOP| + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r3, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r3, r2 + cmp r0, #2 + moveq r1, r4 + moveq r3, r4 + cmp r0, #0 + bne DIFF_POSITIVE_32x32_LOOP + pop {r4,pc} + ENDP + + END diff --git a/libvpx/vp9/decoder/vp9_asm_dec_offsets.c b/libvpx/vp9/decoder/vp9_asm_dec_offsets.c deleted file mode 100644 index e4b9c97..0000000 --- a/libvpx/vp9/decoder/vp9_asm_dec_offsets.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/asm_offsets.h" -#include "vp9/decoder/vp9_onyxd_int.h" - -BEGIN - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.c b/libvpx/vp9/decoder/vp9_dboolhuff.c index df77d65..31b1ae2 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.c +++ b/libvpx/vp9/decoder/vp9_dboolhuff.c @@ -13,6 +13,12 @@ #include "vp9/decoder/vp9_dboolhuff.h" +// This is meant to be a large, positive constant that can still be efficiently +// loaded as an immediate (on platforms like ARM, for example). +// Even relatively modest values like 100 would work fine. +#define VP9_LOTS_OF_BITS 0x40000000 + + int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { int marker_bit; @@ -67,3 +73,20 @@ const uint8_t *vp9_reader_find_end(vp9_reader *r) { return r->buffer; } +int vp9_reader_has_error(vp9_reader *r) { + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. When we reach the end of the + // data, count is additionally filled with VP9_LOTS_OF_BITS. So when + // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. 
+ return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS; +} diff --git a/libvpx/vp9/decoder/vp9_dboolhuff.h b/libvpx/vp9/decoder/vp9_dboolhuff.h index b50aa35..c46dd73 100644 --- a/libvpx/vp9/decoder/vp9_dboolhuff.h +++ b/libvpx/vp9/decoder/vp9_dboolhuff.h @@ -22,11 +22,6 @@ typedef size_t VP9_BD_VALUE; #define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT) -// This is meant to be a large, positive constant that can still be efficiently -// loaded as an immediate (on platforms like ARM, for example). -// Even relatively modest values like 100 would work fine. -#define VP9_LOTS_OF_BITS 0x40000000 - typedef struct { const uint8_t *buffer_end; const uint8_t *buffer; @@ -93,22 +88,6 @@ static int vp9_read_literal(vp9_reader *br, int bits) { return z; } -static int vp9_reader_has_error(vp9_reader *r) { - // Check if we have reached the end of the buffer. - // - // Variable 'count' stores the number of bits in the 'value' buffer, minus - // 8. The top byte is part of the algorithm, and the remainder is buffered - // to be shifted into it. So if count == 8, the top 16 bits of 'value' are - // occupied, 8 for the algorithm and 8 in the buffer. - // - // When reading a byte from the user's buffer, count is filled with 8 and - // one byte is filled into the value buffer. When we reach the end of the - // data, count is additionally filled with VP9_LOTS_OF_BITS. So when - // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted. - // - // 1 if we have tried to decode bits after the end of stream was encountered. - // 0 No error. - return r->count > VP9_BD_VALUE_SIZE && r->count < VP9_LOTS_OF_BITS; -} +int vp9_reader_has_error(vp9_reader *r); #endif // VP9_DECODER_VP9_DBOOLHUFF_H_ diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index b3d41be..6f0044a 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -8,151 +8,188 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> -#include "vp9/decoder/vp9_treereader.h" -#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_pred_common.h" -#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" + #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decodframe.h" -#include "vp9/common/vp9_mvref_common.h" -#if CONFIG_DEBUG -#include <assert.h> -#endif +#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_treereader.h" + +static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p); +} -// #define DEBUG_DEC_MV -#ifdef DEBUG_DEC_MV -int dec_mvcount = 0; -#endif +static MB_PREDICTION_MODE read_inter_mode(vp9_reader *r, const vp9_prob *p) { + return (MB_PREDICTION_MODE)treed_read(r, vp9_inter_mode_tree, p); +} -// #define DEC_DEBUG -#ifdef DEC_DEBUG -extern int dec_debug; -#endif +static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { + return treed_read(r, vp9_segment_tree, seg->tree_probs); +} -static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { - MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p); - return m; +static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, vp9_reader *r) { + const uint8_t context = vp9_get_pred_context_tx_size(xd); + const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs); + TX_SIZE tx_size = vp9_read(r, tx_probs[0]); + if (tx_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { + tx_size += vp9_read(r, tx_probs[1]); + if (tx_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) + tx_size += vp9_read(r, tx_probs[2]); + } + + update_tx_counts(bsize, context, tx_size, &cm->counts.tx); + return tx_size; } -static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) { - return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs); +static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, + BLOCK_SIZE_TYPE bsize, int select_cond, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + if (tx_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond) + return read_selected_tx_size(cm, xd, bsize, r); + else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32) + return TX_32X32; + else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16) + return TX_16X16; + else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8) + return TX_8X8; + else + return TX_4X4; } -static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi, +static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, int mi_row, int mi_col, int segment_id) { - const int mi_index = mi_row * cm->mi_cols + mi_col; - const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type; - const int bw = 1 << mi_width_log2(sb_type); - const int bh = 1 << mi_height_log2(sb_type); - const int ymis = MIN(cm->mi_rows - mi_row, bh); + const int mi_offset = mi_row * cm->mi_cols + mi_col; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); 
const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; - for (y = 0; y < ymis; y++) { - for (x = 0; x < xmis; x++) { - const int index = mi_index + (y * cm->mi_cols + x); - cm->last_frame_seg_map[index] = segment_id; - } - } -} + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); -static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); - const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); - TX_SIZE txfm_size = vp9_read(r, tx_probs[0]); - if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { - txfm_size += vp9_read(r, tx_probs[1]); - if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) - txfm_size += vp9_read(r, tx_probs[2]); - } - if (bsize >= BLOCK_SIZE_SB32X32) { - cm->fc.tx_count_32x32p[context][txfm_size]++; - } else if (bsize >= BLOCK_SIZE_MB16X16) { - cm->fc.tx_count_16x16p[context][txfm_size]++; - } else { - cm->fc.tx_count_8x8p[context][txfm_size]++; - } - return txfm_size; + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) + cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; } +static int read_intra_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, + vp9_reader *r) { + MACROBLOCKD *const xd = &pbi->mb; + struct segmentation *const seg = &xd->seg; + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + int segment_id; + + if (!seg->enabled) + return 0; // Default for disabled segmentation + + if (!seg->update_map) + return 0; -static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m, - int mi_row, int mi_col, - vp9_reader *r) { + segment_id = read_segment_id(r, seg); + set_segment_id(&pbi->common, bsize, mi_row, mi_col, segment_id); + return segment_id; +} + +static int read_inter_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, + vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int mis = cm->mode_info_stride; + struct segmentation *const seg = &xd->seg; + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + int pred_segment_id, segment_id; - // Read segmentation map if it is being updated explicitly this frame - m->mbmi.segment_id = 0; - if (xd->segmentation_enabled && xd->update_mb_segmentation_map) { - m->mbmi.segment_id = read_mb_segid(r, xd); - set_segment_id(cm, &m->mbmi, mi_row, mi_col, m->mbmi.segment_id); - } + if (!seg->enabled) + return 0; // Default for disabled segmentation - m->mbmi.mb_skip_coeff = vp9_segfeature_active(xd, m->mbmi.segment_id, - SEG_LVL_SKIP); - if (!m->mbmi.mb_skip_coeff) { - m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP)); - cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)] - [m->mbmi.mb_skip_coeff]++; + pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, + bsize, mi_row, mi_col); + if (!seg->update_map) + return pred_segment_id; + + if (seg->temporal_update) { + const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + const int pred_flag = vp9_read(r, pred_prob); + vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + segment_id = pred_flag ? 
pred_segment_id + : read_segment_id(r, seg); + } else { + segment_id = read_segment_id(r, seg); } + set_segment_id(cm, bsize, mi_row, mi_col, segment_id); + return segment_id; +} - if (cm->txfm_mode == TX_MODE_SELECT && - m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type); - } else if (cm->txfm_mode >= ALLOW_32X32 && - m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { - m->mbmi.txfm_size = TX_32X32; - } else if (cm->txfm_mode >= ALLOW_16X16 && - m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) { - m->mbmi.txfm_size = TX_16X16; - } else if (cm->txfm_mode >= ALLOW_8X8 && - m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - m->mbmi.txfm_size = TX_8X8; - } else { - m->mbmi.txfm_size = TX_4X4; +static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + int skip_coeff = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); + if (!skip_coeff) { + const int ctx = vp9_get_pred_context_mbskip(xd); + skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd)); + cm->counts.mbskip[ctx][skip_coeff]++; } + return skip_coeff; +} - // luma mode - m->mbmi.ref_frame[0] = INTRA_FRAME; - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { +static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m, + int mi_row, int mi_col, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + MB_MODE_INFO *const mbmi = &m->mbmi; + const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const int mis = cm->mode_info_stride; + + mbmi->segment_id = read_intra_segment_id(pbi, mi_row, mi_col, r); + mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); + mbmi->ref_frame[0] = INTRA_FRAME; + + if (bsize >= BLOCK_SIZE_SB8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); const MB_PREDICTION_MODE L = xd->left_available ? left_block_mode(m, 0) : DC_PRED; - m->mbmi.mode = read_intra_mode(r, cm->kf_y_mode_prob[A][L]); + mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); } else { + // Only 4x4, 4x8, 8x4 blocks + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); int idx, idy; - int bw = 1 << b_width_log2(m->mbmi.sb_type); - int bh = 1 << b_height_log2(m->mbmi.sb_type); for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { - int ib = idy * 2 + idx; - int k; + const int ib = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
left_block_mode(m, ib) : DC_PRED; - m->bmi[ib].as_mode.first = - read_intra_mode(r, cm->kf_y_mode_prob[A][L]); - for (k = 1; k < bh; ++k) - m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first; - for (k = 1; k < bw; ++k) - m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first; + const MB_PREDICTION_MODE b_mode = read_intra_mode(r, + vp9_kf_y_mode_prob[A][L]); + m->bmi[ib].as_mode = b_mode; + if (bh == 2) + m->bmi[ib + 2].as_mode = b_mode; + if (bw == 2) + m->bmi[ib + 1].as_mode = b_mode; } } - m->mbmi.mode = m->bmi[3].as_mode.first; + + mbmi->mode = m->bmi[3].as_mode; } - m->mbmi.uv_mode = read_intra_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]); + mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]); } static int read_mv_component(vp9_reader *r, @@ -161,9 +198,10 @@ static int read_mv_component(vp9_reader *r, int mag, d, fr, hp; const int sign = vp9_read(r, mvcomp->sign); const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); + const int class0 = mv_class == MV_CLASS_0; // Integer part - if (mv_class == MV_CLASS_0) { + if (class0) { d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0); } else { int i; @@ -176,66 +214,77 @@ static int read_mv_component(vp9_reader *r, // Fractional part fr = treed_read(r, vp9_mv_fp_tree, - mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp); + class0 ? mvcomp->class0_fp[d] : mvcomp->fp); // High precision part (if hp is not used, the default value of the hp is 1) - hp = usehp ? vp9_read(r, - mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp) + hp = usehp ? vp9_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp) : 1; - // result + // Result mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1; return sign ? -mag : mag; } -static void update_nmv(vp9_reader *r, vp9_prob *const p, - const vp9_prob upd_p) { - if (vp9_read(r, upd_p)) { -#ifdef LOW_PRECISION_MV_UPDATE +static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, + const nmv_context *ctx, + nmv_context_counts *counts, int usehp) { + const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); + MV diff = {0, 0}; + + usehp = usehp && vp9_use_mv_hp(ref); + if (mv_joint_vertical(j)) + diff.row = read_mv_component(r, &ctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + diff.col = read_mv_component(r, &ctx->comps[1], usehp); + + vp9_inc_mv(&diff, counts); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static void update_mv(vp9_reader *r, vp9_prob *p, vp9_prob upd_p) { + if (vp9_read(r, upd_p)) *p = (vp9_read_literal(r, 7) << 1) | 1; -#else - *p = (vp9_read_literal(r, 8)); -#endif - } } -static void read_nmvprobs(vp9_reader *r, nmv_context *mvctx, - int usehp) { +static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) { int i, j, k; -#ifdef MV_GROUP_UPDATE - if (!vp9_read_bit(r)) - return; -#endif for (j = 0; j < MV_JOINTS - 1; ++j) - update_nmv(r, &mvctx->joints[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->joints[j], VP9_NMV_UPDATE_PROB); for (i = 0; i < 2; ++i) { - update_nmv(r, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB); + nmv_component *const comp = &mvc->comps[i]; + + update_mv(r, &comp->sign, VP9_NMV_UPDATE_PROB); for (j = 0; j < MV_CLASSES - 1; ++j) - update_nmv(r, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->classes[j], VP9_NMV_UPDATE_PROB); for (j = 0; j < CLASS0_SIZE - 1; ++j) - update_nmv(r, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->class0[j], VP9_NMV_UPDATE_PROB); for (j = 0; j < MV_OFFSET_BITS; ++j) - update_nmv(r, 
&mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->bits[j], VP9_NMV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { + nmv_component *const comp = &mvc->comps[i]; + for (j = 0; j < CLASS0_SIZE; ++j) for (k = 0; k < 3; ++k) - update_nmv(r, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->class0_fp[j][k], VP9_NMV_UPDATE_PROB); for (j = 0; j < 3; ++j) - update_nmv(r, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->fp[j], VP9_NMV_UPDATE_PROB); } if (usehp) { for (i = 0; i < 2; ++i) { - update_nmv(r, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB); - update_nmv(r, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->comps[i].hp, VP9_NMV_UPDATE_PROB); } } } @@ -245,205 +294,71 @@ static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int seg_ref_active = vp9_segfeature_active(xd, segment_id, - SEG_LVL_REF_FRAME); + FRAME_CONTEXT *const fc = &cm->fc; + FRAME_COUNTS *const counts = &cm->counts; - // Segment reference frame features not available. - if (!seg_ref_active) { + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME); + ref_frame[1] = NONE; + } else { + const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); int is_comp; - int comp_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_INTER_INTER); if (cm->comp_pred_mode == HYBRID_PREDICTION) { - is_comp = vp9_read(r, cm->fc.comp_inter_prob[comp_ctx]); - cm->fc.comp_inter_count[comp_ctx][is_comp]++; + is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]); + counts->comp_inter[comp_ctx][is_comp]++; } else { is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY; } // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding if (is_comp) { - int b, fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; - int ref_ctx = vp9_get_pred_context(cm, xd, PRED_COMP_REF_P); - - ref_frame[fix_ref_idx] = cm->comp_fixed_ref; - b = vp9_read(r, cm->fc.comp_ref_prob[ref_ctx]); - cm->fc.comp_ref_count[ref_ctx][b]++; + const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]); + counts->comp_ref[ref_ctx][b]++; + ref_frame[fix_ref_idx] = cm->comp_fixed_ref; ref_frame[!fix_ref_idx] = cm->comp_var_ref[b]; } else { - int ref1_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1); + const int ref1_ctx = vp9_get_pred_context_single_ref_p1(xd); ref_frame[1] = NONE; - if (vp9_read(r, cm->fc.single_ref_prob[ref1_ctx][0])) { - int ref2_ctx = vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2); - int b2 = vp9_read(r, cm->fc.single_ref_prob[ref2_ctx][1]); - ref_frame[0] = b2 ? ALTREF_FRAME : GOLDEN_FRAME; - cm->fc.single_ref_count[ref1_ctx][0][1]++; - cm->fc.single_ref_count[ref2_ctx][1][b2]++; + if (vp9_read(r, fc->single_ref_prob[ref1_ctx][0])) { + const int ref2_ctx = vp9_get_pred_context_single_ref_p2(xd); + const int b = vp9_read(r, fc->single_ref_prob[ref2_ctx][1]); + ref_frame[0] = b ? 
ALTREF_FRAME : GOLDEN_FRAME; + counts->single_ref[ref1_ctx][0][1]++; + counts->single_ref[ref2_ctx][1][b]++; } else { ref_frame[0] = LAST_FRAME; - cm->fc.single_ref_count[ref1_ctx][0][0]++; + counts->single_ref[ref1_ctx][0][0]++; } } - } else { - ref_frame[0] = vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; } } -static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *r, const vp9_prob *p) { - return (MB_PREDICTION_MODE) treed_read(r, vp9_sb_mv_ref_tree, p); -} - -#ifdef VPX_MODE_COUNT -unsigned int vp9_mv_cont_count[5][4] = { - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 }, - { 0, 0, 0, 0 } -}; -#endif - -static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) { +static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.switchable_interp_prob[j][i] = - // vp9_read_prob(r); - vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]); - } - } + for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j) + for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); } -static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) { +static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < VP9_INTER_MODES - 1; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r); - cm->fc.inter_mode_probs[i][j] = - vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]); - } - } + for (j = 0; j < VP9_INTER_MODES - 1; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); } static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { COMPPREDMODE_TYPE mode = vp9_read_bit(r); if (mode) - mode += vp9_read_bit(r); + mode += vp9_read_bit(r); return mode; } -static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - - if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) { - nmv_context *const nmvc = &pbi->common.fc.nmvc; - MACROBLOCKD *const xd = &pbi->mb; - int i, j; - - read_inter_mode_probs(cm, r); - - if (cm->mcomp_filter_type == SWITCHABLE) - read_switchable_interp_probs(cm, r); - - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.intra_inter_prob[i] = - vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]); - } - - if (cm->allow_comp_inter_inter) { - cm->comp_pred_mode = read_comp_pred_mode(r); - if (cm->comp_pred_mode == HYBRID_PREDICTION) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.comp_inter_prob[i] = - vp9_read_prob_diff_update(r, cm->fc.comp_inter_prob[i]); - } else { - cm->comp_pred_mode = SINGLE_PREDICTION_ONLY; - } - - if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.single_ref_prob[i][0] = - vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][0]); - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - cm->fc.single_ref_prob[i][1] = - vp9_read_prob_diff_update(r, cm->fc.single_ref_prob[i][1]); - } - - if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - 
cm->fc.comp_ref_prob[i] = - vp9_read_prob_diff_update(r, cm->fc.comp_ref_prob[i]); - - // VP9_INTRA_MODES - for (j = 0; j < BLOCK_SIZE_GROUPS; j++) { - for (i = 0; i < VP9_INTRA_MODES - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.y_mode_prob[j][i] = - vp9_read_prob_diff_update(r, cm->fc.y_mode_prob[j][i]); - } - } - } - for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) { - for (i = 0; i < PARTITION_TYPES - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.partition_prob[INTER_FRAME][j][i] = - vp9_read_prob_diff_update(r, - cm->fc.partition_prob[INTER_FRAME][j][i]); - } - } - } - - read_nmvprobs(r, nmvc, xd->allow_high_precision_mv); - } -} - -// This function either reads the segment id for the current macroblock from -// the bitstream or if the value is temporally predicted asserts the predicted -// value -static int read_mb_segment_id(VP9D_COMP *pbi, int mi_row, int mi_col, - vp9_reader *r) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *const mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; - - if (!xd->segmentation_enabled) - return 0; // Default for disabled segmentation - - if (xd->update_mb_segmentation_map) { - int segment_id; - - if (cm->temporal_update) { - // Temporal coding of the segment id for this mb is enabled. - // Get the context based probability for reading the - // prediction status flag - const vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID); - const int pred_flag = vp9_read(r, pred_prob); - vp9_set_pred_flag(xd, PRED_SEG_ID, pred_flag); - - // If the value is flagged as correctly predicted - // then use the predicted value, otherwise decode it explicitly - segment_id = pred_flag ? vp9_get_pred_mi_segid(cm, mbmi->sb_type, - mi_row, mi_col) - : read_mb_segid(r, xd); - } else { - segment_id = read_mb_segid(r, xd); // Normal unpredicted coding mode - } - - set_segment_id(cm, mbmi, mi_row, mi_col, segment_id); // Side effect - return segment_id; - } else { - return vp9_get_pred_mi_segid(cm, mbmi->sb_type, mi_row, mi_col); - } -} - - static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src, int mb_to_left_edge, int mb_to_right_edge, @@ -454,242 +369,188 @@ static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src, mb_to_bottom_edge); } -static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref, - const nmv_context *ctx, - nmv_context_counts *counts, - int usehp) { - const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); - MV diff = {0, 0}; - - usehp = usehp && vp9_use_nmv_hp(ref); - if (mv_joint_vertical(j)) - diff.row = read_mv_component(r, &ctx->comps[0], usehp); +static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( + VP9D_COMP *pbi, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const vp9_prob *probs = vp9_get_pred_probs_switchable_interp(cm, xd); + const int index = treed_read(r, vp9_switchable_interp_tree, probs); + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][index]; + return vp9_switchable_interp[index]; +} - if (mv_joint_horizontal(j)) - diff.col = read_mv_component(r, &ctx->comps[1], usehp); +static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - vp9_increment_nmv(&diff, ref, counts, 
usehp); + if (bsize >= BLOCK_SIZE_SB8X8) { + const int size_group = MIN(3, MIN(bwl, bhl)); + mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]); + cm->counts.y_mode[size_group][mbmi->mode]++; + } else { + // Only 4x4, 4x8, 8x4 blocks + const int bw = 1 << bwl, bh = 1 << bhl; + int idx, idy; + + for (idy = 0; idy < 2; idy += bh) { + for (idx = 0; idx < 2; idx += bw) { + const int ib = idy * 2 + idx; + const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]); + mi->bmi[ib].as_mode = b_mode; + cm->counts.y_mode[0][b_mode]++; + + if (bh == 2) + mi->bmi[ib + 2].as_mode = b_mode; + if (bw == 2) + mi->bmi[ib + 1].as_mode = b_mode; + } + } + mbmi->mode = mi->bmi[3].as_mode; + } - mv->row = diff.row + ref->row; - mv->col = diff.col + ref->col; + mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); + cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++; } -static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( - VP9D_COMP *pbi, vp9_reader *r) { - const int index = treed_read(r, vp9_switchable_interp_tree, - vp9_get_pred_probs(&pbi->common, &pbi->mb, - PRED_SWITCHABLE_INTERP)); - ++pbi->common.fc.switchable_interp_count - [vp9_get_pred_context( - &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index]; - return vp9_switchable_interp[index]; +static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + MV_REFERENCE_FRAME ref; + if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { + const int ctx = vp9_get_pred_context_intra_inter(xd); + ref = (MV_REFERENCE_FRAME) + vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd)); + cm->counts.intra_inter[ctx][ref != INTRA_FRAME]++; + } else { + ref = (MV_REFERENCE_FRAME) vp9_get_segdata(&xd->seg, segment_id, + SEG_LVL_REF_FRAME) != INTRA_FRAME; + } + return ref; } -static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, - int mi_row, int mi_col, - vp9_reader *r) { +static void read_inter_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, + int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; - nmv_context *const nmvc = &cm->fc.nmvc; MACROBLOCKD *const xd = &pbi->mb; + nmv_context *const nmvc = &cm->fc.nmvc; + MB_MODE_INFO *const mbmi = &mi->mbmi; int_mv *const mv0 = &mbmi->mv[0]; int_mv *const mv1 = &mbmi->mv[1]; - BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); - int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge; - int j, idx, idy; + int idx, idy; + mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); + mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r); mbmi->ref_frame[1] = NONE; + mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, + (!mbmi->mb_skip_coeff || mbmi->ref_frame[0] == INTRA_FRAME), r); - // Make sure the MACROBLOCKD mode info pointer is pointed at the - // correct entry for the current macroblock. - xd->mode_info_context = mi; - - // Distance of Mb to the various image edges. 
- // These specified to 8th pel as they are always compared to MV values - // that are in 1/8th pel units - set_mi_row_col(cm, xd, mi_row, 1 << mi_height_log2(bsize), - mi_col, 1 << mi_width_log2(bsize)); - - mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN; - mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; - mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN; - mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; - - // Read the macroblock segment id. - mbmi->segment_id = read_mb_segment_id(pbi, mi_row, mi_col, r); - - mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id, - SEG_LVL_SKIP); - if (!mbmi->mb_skip_coeff) { - mbmi->mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP)); - cm->fc.mbskip_count[vp9_get_pred_context(cm, xd, PRED_MBSKIP)] - [mbmi->mb_skip_coeff]++; - } - - // Read the reference frame - if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) { - mbmi->ref_frame[0] = - vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER)); - cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)] - [mbmi->ref_frame[0] != INTRA_FRAME]++; - } else { - mbmi->ref_frame[0] = - vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; - } - - if (cm->txfm_mode == TX_MODE_SELECT && - (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) && - bsize >= BLOCK_SIZE_SB8X8) { - mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize); - } else if (bsize >= BLOCK_SIZE_SB32X32 && - cm->txfm_mode >= ALLOW_32X32) { - mbmi->txfm_size = TX_32X32; - } else if (cm->txfm_mode >= ALLOW_16X16 && - bsize >= BLOCK_SIZE_MB16X16) { - mbmi->txfm_size = TX_16X16; - } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) { - mbmi->txfm_size = TX_8X8; - } else { - mbmi->txfm_size = TX_4X4; - } - - // If reference frame is an Inter frame if (mbmi->ref_frame[0] != INTRA_FRAME) { int_mv nearest, nearby, best_mv; int_mv nearest_second, nearby_second, best_mv_second; vp9_prob *mv_ref_p; + MV_REFERENCE_FRAME ref0, ref1; read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame); + ref0 = mbmi->ref_frame[0]; + ref1 = mbmi->ref_frame[1]; - { -#ifdef DEC_DEBUG - if (dec_debug) - printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row, - xd->mode_info_context->mbmi.mv[0].as_mv.col); -#endif - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - mbmi->ref_frame[0], mbmi->ref_mvs[mbmi->ref_frame[0]], - cm->ref_frame_sign_bias); - - mv_ref_p = cm->fc.inter_mode_probs[ - mbmi->mb_mode_context[mbmi->ref_frame[0]]]; - - // If the segment level skip mode enabled - if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) { - mbmi->mode = ZEROMV; - } else if (bsize >= BLOCK_SIZE_SB8X8) { - mbmi->mode = read_sb_mv_ref(r, mv_ref_p); - vp9_accum_mv_refs(cm, mbmi->mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); - } + vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, + ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias); - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, - mbmi->ref_mvs[mbmi->ref_frame[0]], - &nearest, &nearby); + mv_ref_p = cm->fc.inter_mode_probs[mbmi->mb_mode_context[ref0]]; - best_mv.as_int = mbmi->ref_mvs[mbmi->ref_frame[0]][0].as_int; - } + if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + mbmi->mode = ZEROMV; + } else if (bsize >= BLOCK_SIZE_SB8X8) { + mbmi->mode = read_inter_mode(r, mv_ref_p); + vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref0]); + } + mbmi->uv_mode = DC_PRED; -#ifdef 
DEC_DEBUG - if (dec_debug) - printf("[D %d %d] %d %d %d %d\n", ref_frame, - mbmi->mb_mode_context[ref_frame], - mv_ref_p[0], mv_ref_p[1], mv_ref_p[2], mv_ref_p[3]); -#endif + // nearest, nearby + if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); + best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; } mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE ? read_switchable_filter_type(pbi, r) : cm->mcomp_filter_type; - if (mbmi->ref_frame[1] > INTRA_FRAME) { + if (ref1 > INTRA_FRAME) { vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - mbmi->ref_frame[1], - mbmi->ref_mvs[mbmi->ref_frame[1]], - cm->ref_frame_sign_bias); + ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias); if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, - mbmi->ref_mvs[mbmi->ref_frame[1]], - &nearest_second, - &nearby_second); - best_mv_second.as_int = mbmi->ref_mvs[mbmi->ref_frame[1]][0].as_int; + vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], + &nearest_second, &nearby_second); + best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; } } - mbmi->uv_mode = DC_PRED; + if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { for (idy = 0; idy < 2; idy += bh) { for (idx = 0; idx < 2; idx += bw) { int_mv blockmv, secondmv; - int blockmode; - int i; - j = idy * 2 + idx; + const int j = idy * 2 + idx; + const int blockmode = read_inter_mode(r, mv_ref_p); - blockmode = read_sb_mv_ref(r, mv_ref_p); - vp9_accum_mv_refs(cm, blockmode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + vp9_accum_mv_refs(cm, blockmode, mbmi->mb_mode_context[ref0]); if (blockmode == NEARESTMV || blockmode == NEARMV) { - MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0); - if (rf2 > 0) { + if (ref1 > 0) vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, &nearby_second, j, 1); - } } switch (blockmode) { case NEWMV: - decode_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, - &cm->fc.NMVcount, xd->allow_high_precision_mv); + read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, + &cm->counts.mv, xd->allow_high_precision_mv); - if (mbmi->ref_frame[1] > 0) - decode_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, - &cm->fc.NMVcount, xd->allow_high_precision_mv); - -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][3]++; -#endif + if (ref1 > 0) + read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, + &cm->counts.mv, xd->allow_high_precision_mv); break; case NEARESTMV: blockmv.as_int = nearest.as_int; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) secondmv.as_int = nearest_second.as_int; -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][0]++; -#endif break; case NEARMV: blockmv.as_int = nearby.as_int; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) secondmv.as_int = nearby_second.as_int; -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][1]++; -#endif break; case ZEROMV: blockmv.as_int = 0; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) secondmv.as_int = 0; -#ifdef VPX_MODE_COUNT - vp9_mv_cont_count[mv_contz][2]++; -#endif break; default: - break; + assert(!"Invalid inter mode value"); } mi->bmi[j].as_mv[0].as_int = blockmv.as_int; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) mi->bmi[j].as_mv[1].as_int = secondmv.as_int; - for (i = 1; i < bh; ++i) - vpx_memcpy(&mi->bmi[j + i * 2], &mi->bmi[j], sizeof(mi->bmi[j])); - for (i = 1; i < bw; ++i) - vpx_memcpy(&mi->bmi[j + i], &mi->bmi[j], sizeof(mi->bmi[j])); + if (bh == 2) + mi->bmi[j + 2] = mi->bmi[j]; + if (bw == 2) + mi->bmi[j + 1] = mi->bmi[j]; 
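
[Aside, not part of the patch] The hunk above replaces the per-element vpx_memcpy loops with two guarded struct copies when a decoded sub-8x8 partition spans more than one 4x4 unit. A self-contained sketch of that indexing, with invented types and values:

#include <stdio.h>

typedef struct { int row, col; } MvSketch;   /* stand-in for the real b_mode_info */

/* The four 4x4 units inside an 8x8 block are indexed j = idy * 2 + idx.
 * A partition two units tall (bh == 2) also owns the unit below it;
 * one two units wide (bw == 2) also owns the unit to its right. */
static void replicate(MvSketch bmi[4], int j, int bw, int bh) {
  if (bh == 2)
    bmi[j + 2] = bmi[j];
  if (bw == 2)
    bmi[j + 1] = bmi[j];
}

int main(void) {
  MvSketch bmi[4] = { {0, 0}, {0, 0}, {0, 0}, {0, 0} };
  const int bw = 2, bh = 1;                  /* an 8x4 partition */
  int idy, idx, j;

  for (idy = 0; idy < 2; idy += bh) {
    for (idx = 0; idx < 2; idx += bw) {
      j = idy * 2 + idx;
      bmi[j].row = 8 * j;                    /* pretend this was just decoded */
      bmi[j].col = -8 * j;
      replicate(bmi, j, bw, bh);
    }
  }

  for (j = 0; j < 4; j++)
    printf("4x4 unit %d: mv = (%d, %d)\n", j, bmi[j].row, bmi[j].col);
  return 0;
}

Running it shows units 0/1 sharing one motion vector and units 2/3 sharing the other, which is exactly the effect of the two assignments in the patch.
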
mi->mbmi.mode = blockmode; } } @@ -697,6 +558,11 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mv0->as_int = mi->bmi[3].as_mv[0].as_int; mv1->as_int = mi->bmi[3].as_mv[1].as_int; } else { + const int mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN; + const int mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; + const int mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN; + const int mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; + switch (mbmi->mode) { case NEARMV: // Clip "next_nearest" so that it does not extend to far out of image @@ -704,7 +570,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge); - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, @@ -717,7 +583,7 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge); - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, @@ -726,98 +592,109 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, case ZEROMV: mv0->as_int = 0; - if (mbmi->ref_frame[1] > 0) + if (ref1 > 0) mv1->as_int = 0; break; case NEWMV: - decode_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount, - xd->allow_high_precision_mv); - if (mbmi->ref_frame[1] > 0) - decode_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, - &cm->fc.NMVcount, xd->allow_high_precision_mv); + read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, + xd->allow_high_precision_mv); + if (ref1 > 0) + read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, + &cm->counts.mv, xd->allow_high_precision_mv); break; default: -#if CONFIG_DEBUG - assert(0); -#endif - break; + assert(!"Invalid inter mode value"); } } } else { - // required for left and above block mv - mv0->as_int = 0; - - if (bsize >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - const int bsl = MIN(bwl, bhl); - mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]); - cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++; - } else { - int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - int ib = idy * 2 + idx, k; - int m = read_intra_mode(r, cm->fc.y_mode_prob[0]); - mi->bmi[ib].as_mode.first = m; - cm->fc.y_mode_counts[0][m]++; - for (k = 1; k < bh; ++k) - mi->bmi[ib + k * 2].as_mode.first = m; - for (k = 1; k < bw; ++k) - mi->bmi[ib + k].as_mode.first = m; - } - } - mbmi->mode = mi->bmi[3].as_mode.first; + mv0->as_int = 0; // required for left and above block mv + read_intra_block_modes(pbi, mi, r); + } +} + +static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { + int i; + + cm->comp_pred_mode = cm->allow_comp_inter_inter ? 
read_comp_pred_mode(r) + : SINGLE_PREDICTION_ONLY; + + if (cm->comp_pred_mode == HYBRID_PREDICTION) + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); + + if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) + for (i = 0; i < REF_CONTEXTS; i++) { + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); } - mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); - cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++; - } + if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) + for (i = 0; i < REF_CONTEXTS; i++) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); } -void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) { - VP9_COMMON *cm = &pbi->common; +void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; int k; // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove. // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs)); - for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.mbskip_probs[k] = - vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]); - } - // cm->fc.mbskip_probs[k] = vp9_read_prob(r); - } + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]); + + if (cm->frame_type != KEY_FRAME && !cm->intra_only) { + nmv_context *const nmvc = &pbi->common.fc.nmvc; + MACROBLOCKD *const xd = &pbi->mb; + int i, j; + + read_inter_mode_probs(&cm->fc, r); - mb_mode_mv_init(pbi, r); + if (cm->mcomp_filter_type == SWITCHABLE) + read_switchable_interp_probs(&cm->fc, r); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]); + + read_comp_pred(cm, r); + + for (j = 0; j < BLOCK_SIZE_GROUPS; j++) + for (i = 0; i < VP9_INTRA_MODES - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]); + + for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j) + for (i = 0; i < PARTITION_TYPES - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]); + + read_mv_probs(r, nmvc, xd->allow_high_precision_mv); + } } -void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi, - MACROBLOCKD* const xd, - int mi_row, - int mi_col, - vp9_reader *r) { +void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bw = 1 << mi_width_log2(bsize); + const int bh = 1 << mi_height_log2(bsize); + const int y_mis = MIN(bh, cm->mi_rows - mi_row); + const int x_mis = MIN(bw, cm->mi_cols - mi_col); + int x, y; - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { - kfread_modes(pbi, mi, mi_row, mi_col, r); - } else { - read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r); - } + if (cm->frame_type == KEY_FRAME || cm->intra_only) + read_intra_mode_info(pbi, mi, mi_row, mi_col, r); + else + read_inter_mode_info(pbi, mi, mi_row, mi_col, r); - if (1) { - const int bw = 1 << mi_width_log2(mbmi->sb_type); - const int bh = 1 << 
mi_height_log2(mbmi->sb_type); - const int y_mis = MIN(bh, cm->mi_rows - mi_row); - const int x_mis = MIN(bw, cm->mi_cols - mi_col); - const int mis = cm->mode_info_stride; - int x, y; - - for (y = 0; y < y_mis; y++) - for (x = !y; x < x_mis; x++) - mi[y * mis + x] = *mi; - } + for (y = 0; y < y_mis; y++) + for (x = !y; x < x_mis; x++) + mi[y * cm->mode_info_stride + x] = *mi; } diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h index bf5e83c..4073d9e 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libvpx/vp9/decoder/vp9_decodemv.h @@ -13,11 +13,8 @@ #include "vp9/decoder/vp9_onyxd_int.h" -void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi, - MACROBLOCKD* const xd, - int mb_row, - int mb_col, - vp9_reader *r); -void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r); +void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r); + +void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r); #endif // VP9_DECODER_VP9_DECODEMV_H_ diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index 49b181d..ffec8ea 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -14,15 +14,15 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_extend.h" -#include "vp9/common/vp9_modecont.h" +#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_reconintra.h" -#include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_entropy.h" -#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_extend.h" +#include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/common/vp9_tile_common.h" @@ -30,169 +30,58 @@ #include "vp9/decoder/vp9_decodframe.h" #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_decodemv.h" +#include "vp9/decoder/vp9_dsubexp.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_read_bit_buffer.h" - -// #define DEC_DEBUG -#ifdef DEC_DEBUG -int dec_debug = 0; -#endif - static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; } // len == 0 is not allowed -static int read_is_valid(const uint8_t *start, size_t len, - const uint8_t *end) { +static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return start + len > start && start + len <= end; } -static void setup_txfm_mode(VP9_COMMON *pc, int lossless, vp9_reader *r) { - if (lossless) { - pc->txfm_mode = ONLY_4X4; - } else { - pc->txfm_mode = vp9_read_literal(r, 2); - if (pc->txfm_mode == ALLOW_32X32) - pc->txfm_mode += vp9_read_bit(r); - if (pc->txfm_mode == TX_MODE_SELECT) { - int i, j; - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - pc->fc.tx_probs_8x8p[i][j] = - vp9_read_prob_diff_update(r, pc->fc.tx_probs_8x8p[i][j]); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - pc->fc.tx_probs_16x16p[i][j] = - vp9_read_prob_diff_update(r, pc->fc.tx_probs_16x16p[i][j]); - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) - pc->fc.tx_probs_32x32p[i][j] = - vp9_read_prob_diff_update(r, 
pc->fc.tx_probs_32x32p[i][j]); - } - } - } - } -} - -static int get_unsigned_bits(unsigned int num_values) { - int cat = 0; - if (num_values <= 1) - return 0; - num_values--; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; -} - -static int inv_recenter_nonneg(int v, int m) { - if (v > 2 * m) - return v; - - return v % 2 ? m - (v + 1) / 2 : m + v / 2; -} - -static int decode_uniform(vp9_reader *r, int n) { - int v; - const int l = get_unsigned_bits(n); - const int m = (1 << l) - n; - if (!l) - return 0; - - v = vp9_read_literal(r, l - 1); - return v < m ? v : (v << 1) - m + vp9_read_bit(r); -} - -static int decode_term_subexp(vp9_reader *r, int k, int num_syms) { - int i = 0, mk = 0, word; - while (1) { - const int b = i ? k + i - 1 : k; - const int a = 1 << b; - if (num_syms <= mk + 3 * a) { - word = decode_uniform(r, num_syms - mk) + mk; - break; - } else { - if (vp9_read_bit(r)) { - i++; - mk += a; - } else { - word = vp9_read_literal(r, b) + mk; - break; - } - } - } - return word; -} - static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) { const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max)); return data > max ? max : data; } -static int merge_index(int v, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (v < max1) { - v = v * modulus + modulus / 2; - } else { - int w; - v -= max1; - w = v; - v += (v + modulus - modulus / 2) / modulus; - while (v % modulus == modulus / 2 || - w != v - (v + modulus - modulus / 2) / modulus) v++; - } - return v; -} - -static int inv_remap_prob(int v, int m) { - const int n = 255; - - v = merge_index(v, n - 1, MODULUS_PARAM); - m--; - if ((m << 1) <= n) { - return 1 + inv_recenter_nonneg(v + 1, m); - } else { - return n - inv_recenter_nonneg(v + 1, n - 1 - m); - } +static TX_MODE read_tx_mode(vp9_reader *r) { + TX_MODE tx_mode = vp9_read_literal(r, 2); + if (tx_mode == ALLOW_32X32) + tx_mode += vp9_read_bit(r); + return tx_mode; } -vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp) { - int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); - return (vp9_prob)inv_remap_prob(delp, oldp); -} +static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { + int i, j; -void vp9_init_dequantizer(VP9_COMMON *pc) { - int q; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); - for (q = 0; q < QINDEX_RANGE; q++) { - // DC value - pc->y_dequant[q][0] = vp9_dc_quant(q, pc->y_dc_delta_q); - pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q); + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); - // AC values - pc->y_dequant[q][1] = vp9_ac_quant(q, 0); - pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q); - } + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) + for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); } -static void mb_init_dequantizer(VP9_COMMON *pc, MACROBLOCKD *xd) { +static void init_dequantizer(VP9_COMMON *cm, MACROBLOCKD *xd) { int i; const int segment_id = xd->mode_info_context->mbmi.segment_id; - xd->q_index = vp9_get_qindex(xd, segment_id, pc->base_qindex); + xd->q_index = vp9_get_qindex(xd, segment_id, cm->base_qindex); - xd->plane[0].dequant = pc->y_dequant[xd->q_index]; + xd->plane[0].dequant = cm->y_dequant[xd->q_index]; 
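
[Aside, not part of the patch] init_dequantizer here only selects rows from tables that vp9_init_dequantizer (added later in this file) fills once per frame. A rough sketch of that precompute-then-index pattern, using placeholder step-size formulas instead of the real vp9_dc_quant/vp9_ac_quant lookups:

#include <stdint.h>
#include <stdio.h>

#define QINDEX_RANGE 256

/* Placeholder formulas; the real tables come from vp9_dc_quant()/vp9_ac_quant(). */
static int16_t fake_dc_quant(int q) { return (int16_t)(4 + 2 * q); }
static int16_t fake_ac_quant(int q) { return (int16_t)(4 + 3 * q); }

static int16_t y_dequant[QINDEX_RANGE][2];   /* [q][0] = DC step, [q][1] = AC step */

int main(void) {
  int q;

  /* Filled once per frame ... */
  for (q = 0; q < QINDEX_RANGE; q++) {
    y_dequant[q][0] = fake_dc_quant(q);
    y_dequant[q][1] = fake_ac_quant(q);
  }

  /* ... then each block just points at the row for its q_index,
   * which may differ per segment. */
  {
    const int q_index = 63;
    const int16_t *dequant = y_dequant[q_index];
    printf("q_index %d: DC step %d, AC step %d\n", q_index, dequant[0], dequant[1]);
  }
  return 0;
}
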
for (i = 1; i < MAX_MB_PLANE; i++) - xd->plane[i].dequant = pc->uv_dequant[xd->q_index]; + xd->plane[i].dequant = cm->uv_dequant[xd->q_index]; } static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -201,32 +90,32 @@ static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, struct macroblockd_plane *pd = &xd->plane[plane]; int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); const int stride = pd->dst.stride; + const int eob = pd->eobs[block]; const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, block, ss_txfrm_size); uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, pd->dst.buf, stride); - TX_TYPE tx_type; - switch (ss_txfrm_size / 2) { - case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; + case TX_4X4: { + const TX_TYPE tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block); if (tx_type == DCT_DCT) - xd->itxm_add(qcoeff, dst, stride, pd->eobs[block]); + xd->itxm_add(qcoeff, dst, stride, eob); else - vp9_iht_add_c(tx_type, qcoeff, dst, stride, pd->eobs[block]); + vp9_iht_add_c(tx_type, qcoeff, dst, stride, eob); break; + } case TX_8X8: - tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; - vp9_iht_add_8x8_c(tx_type, qcoeff, dst, stride, pd->eobs[block]); + vp9_iht_add_8x8_c(get_tx_type_8x8(pd->plane_type, xd), qcoeff, dst, + stride, eob); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - vp9_iht_add_16x16_c(tx_type, qcoeff, dst, stride, pd->eobs[block]); + vp9_iht_add_16x16_c(get_tx_type_16x16(pd->plane_type, xd), qcoeff, dst, + stride, eob); break; case TX_32X32: - vp9_idct_add_32x32(qcoeff, dst, stride, pd->eobs[block]); + vp9_idct_add_32x32(qcoeff, dst, stride, eob); break; } } @@ -235,6 +124,7 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { MACROBLOCKD* const xd = arg; struct macroblockd_plane *pd = &xd->plane[plane]; + MODE_INFO *const mi = xd->mode_info_context; const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, block, ss_txfrm_size); @@ -245,13 +135,12 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, int b_mode; int plane_b_size; const int tx_ib = raster_block >> tx_size; - const int mode = plane == 0 ? xd->mode_info_context->mbmi.mode - : xd->mode_info_context->mbmi.uv_mode; + const int mode = plane == 0 ? 
mi->mbmi.mode + : mi->mbmi.uv_mode; - - if (plane == 0 && xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + if (plane == 0 && mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) { assert(bsize == BLOCK_SIZE_SB8X8); - b_mode = xd->mode_info_context->bmi[raster_block].as_mode.first; + b_mode = mi->bmi[raster_block].as_mode; } else { b_mode = mode; } @@ -261,97 +150,28 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, plane_b_size = b_width_log2(bsize) - pd->subsampling_x; vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, + dst, pd->dst.stride, dst, pd->dst.stride); // Early exit if there are no coefficients - if (xd->mode_info_context->mbmi.mb_skip_coeff) + if (mi->mbmi.mb_skip_coeff) return; decode_block(plane, block, bsize, ss_txfrm_size, arg); } -static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd, - int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - - assert(mbmi->ref_frame[0] != INTRA_FRAME); - - if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only)) - vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common); - - // prediction - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - - if (mbmi->mb_skip_coeff) { - vp9_reset_sb_tokens_context(xd, bsize); - } else { - if (xd->segmentation_enabled) - mb_init_dequantizer(&pbi->common, xd); - - if (!vp9_reader_has_error(r)) - vp9_decode_tokens(pbi, r, bsize); - - foreach_transformed_block(xd, bsize, decode_block, xd); - } -} +static int decode_tokens(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, vp9_reader *r) { + MACROBLOCKD *const xd = &pbi->mb; -static void decode_sb_intra(VP9D_COMP *pbi, MACROBLOCKD *xd, - int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - if (mbmi->mb_skip_coeff) { + if (xd->mode_info_context->mbmi.mb_skip_coeff) { vp9_reset_sb_tokens_context(xd, bsize); + return -1; } else { - if (xd->segmentation_enabled) - mb_init_dequantizer(&pbi->common, xd); - - if (!vp9_reader_has_error(r)) - vp9_decode_tokens(pbi, r, bsize); - } + if (xd->seg.enabled) + init_dequantizer(&pbi->common, xd); - foreach_transformed_block(xd, bsize, decode_block_intra, xd); -} - - -static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - const int bwl = mi_width_log2(bsize), bhl = mi_height_log2(bsize); - const int bw = 1 << bwl, bh = 1 << bhl; - int n, eobtotal; - VP9_COMMON *const pc = &pbi->common; - MODE_INFO *const mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; - const int mis = pc->mode_info_stride; - - assert(mbmi->sb_type == bsize); - assert(mbmi->ref_frame[0] != INTRA_FRAME); - - if (pbi->common.frame_type != KEY_FRAME) - vp9_setup_interp_filters(xd, mbmi->interp_filter, pc); - - // generate prediction - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - - if (mbmi->mb_skip_coeff) { - vp9_reset_sb_tokens_context(xd, bsize); - } else { - // re-initialize macroblock dequantizer before detokenization - if (xd->segmentation_enabled) - mb_init_dequantizer(pc, xd); - - // dequantization and idct - eobtotal = vp9_decode_tokens(pbi, r, bsize); - if (eobtotal == 0) { // skip loopfilter - for (n = 0; n < bw * bh; n++) { - const int x_idx = n & (bw - 1), y_idx = n >> bwl; - - if (mi_col + x_idx < pc->mi_cols && mi_row + y_idx < pc->mi_rows) - mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = 1; - } - } else { - foreach_transformed_block(xd, bsize, 
decode_block, xd); - } + // TODO(dkovalev) if (!vp9_reader_has_error(r)) + return vp9_decode_tokens(pbi, r, bsize); } } @@ -377,8 +197,8 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, pd->left_context = cm->left_context[i] + (((mi_row * 2) & 15) >> pd->subsampling_y); } - xd->above_seg_context = cm->above_seg_context + mi_col; - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); + + set_partition_seg_context(cm, xd, mi_row, mi_col); // Distance of Mb to the various image edges. These are specified to 8th pel // as they are always compared to values that are in 1/8th pel units @@ -387,53 +207,65 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE_TYPE bsize, setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col); } -static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) { +static void set_ref(VP9D_COMP *pbi, int i, int mi_row, int mi_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + const int ref = mbmi->ref_frame[i] - 1; - if (mbmi->ref_frame[0] > INTRA_FRAME) { - // Select the appropriate reference frame for this MB - const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1]; - const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx]; - xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; - xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; - setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, - xd->scale_factor, xd->scale_factor_uv); - xd->corrupted |= cfg->corrupted; - - if (mbmi->ref_frame[1] > INTRA_FRAME) { - // Select the appropriate reference frame for this MB - const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1]; - const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx]; - xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; - xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; - setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, - xd->scale_factor, xd->scale_factor_uv); - xd->corrupted |= second_cfg->corrupted; - } - } + const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]]; + xd->scale_factor[i] = cm->active_ref_scale[ref]; + setup_pre_planes(xd, i, cfg, mi_row, mi_col, &xd->scale_factor[i]); + xd->corrupted |= cfg->corrupted; } static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; + const int less8x8 = bsize < BLOCK_SIZE_SB8X8; + MB_MODE_INFO *mbmi; - if (bsize < BLOCK_SIZE_SB8X8) + if (less8x8) if (xd->ab_index > 0) return; + set_offsets(pbi, bsize, mi_row, mi_col); - vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r); - set_refs(pbi, mi_row, mi_col); + vp9_read_mode_info(pbi, mi_row, mi_col, r); - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ? 
- BLOCK_SIZE_SB8X8 : bsize); - else if (bsize < BLOCK_SIZE_SB8X8) - decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8); - else - decode_sb(pbi, xd, mi_row, mi_col, r, bsize); + if (less8x8) + bsize = BLOCK_SIZE_SB8X8; + // Has to be called after set_offsets + mbmi = &xd->mode_info_context->mbmi; + + if (mbmi->ref_frame[0] == INTRA_FRAME) { + // Intra reconstruction + decode_tokens(pbi, bsize, r); + foreach_transformed_block(xd, bsize, decode_block_intra, xd); + } else { + // Inter reconstruction + int eobtotal; + + set_ref(pbi, 0, mi_row, mi_col); + if (mbmi->ref_frame[1] > INTRA_FRAME) + set_ref(pbi, 1, mi_row, mi_col); + + vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + eobtotal = decode_tokens(pbi, bsize, r); + if (less8x8) { + if (eobtotal >= 0) + foreach_transformed_block(xd, bsize, decode_block, xd); + } else { + assert(mbmi->sb_type == bsize); + if (eobtotal == 0) + // skip loopfilter + vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, 1); + else if (eobtotal > 0) + foreach_transformed_block(xd, bsize, decode_block, xd); + } + } xd->corrupted |= vp9_reader_has_error(r); } @@ -448,16 +280,13 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols) return; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_SIZE_SB8X8) { if (xd->ab_index != 0) return; - - if (bsize >= BLOCK_SIZE_SB8X8) { + } else { int pl; - int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize); - // read the partition information - xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = pc->above_seg_context + mi_col; + const int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize); + set_partition_seg_context(pc, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); if (idx == 0) @@ -469,7 +298,7 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, else partition = PARTITION_SPLIT; - pc->fc.partition_counts[pl][partition]++; + pc->counts.partition[pl][partition]++; } subsize = get_subsize(bsize, partition); @@ -499,8 +328,9 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, } break; default: - assert(0); + assert(!"Invalid partition type"); } + // update partition context if (bsize >= BLOCK_SIZE_SB8X8 && (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { @@ -527,142 +357,118 @@ static void setup_token_decoder(VP9D_COMP *pbi, "Failed to allocate bool decoder %d", 1); } -static void read_coef_probs_common(FRAME_CONTEXT *fc, TX_SIZE tx_size, +static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, vp9_reader *r) { - vp9_coeff_probs_model *coef_probs = fc->coef_probs[tx_size]; - - if (vp9_read_bit(r)) { - int i, j, k, l, m; - for (i = 0; i < BLOCK_TYPES; i++) { - for (j = 0; j < REF_TYPES; j++) { - for (k = 0; k < COEF_BANDS; k++) { - for (l = 0; l < PREV_COEF_CONTEXTS; l++) { - if (l >= 3 && k == 0) - continue; - - for (m = 0; m < UNCONSTRAINED_NODES; m++) { - vp9_prob *const p = coef_probs[i][j][k][l] + m; - - if (vp9_read(r, VP9_COEF_UPDATE_PROB)) - *p = vp9_read_prob_diff_update(r, *p); - } - } - } - } - } - } -} + int i, j, k, l, m; -static void read_coef_probs(VP9D_COMP *pbi, vp9_reader *r) { - const TXFM_MODE txfm_mode = pbi->common.txfm_mode; - FRAME_CONTEXT *const fc = &pbi->common.fc; + if (vp9_read_bit(r)) + for (i = 0; i < BLOCK_TYPES; i++) + for (j = 0; j < REF_TYPES; j++) + for (k = 0; k < COEF_BANDS; k++) + for (l = 0; l < PREV_COEF_CONTEXTS; 
l++) + if (k > 0 || l < 3) + for (m = 0; m < UNCONSTRAINED_NODES; m++) + if (vp9_read(r, VP9_COEF_UPDATE_PROB)) + vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); +} - read_coef_probs_common(fc, TX_4X4, r); +static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, + vp9_reader *r) { + read_coef_probs_common(fc->coef_probs[TX_4X4], r); - if (txfm_mode > ONLY_4X4) - read_coef_probs_common(fc, TX_8X8, r); + if (tx_mode > ONLY_4X4) + read_coef_probs_common(fc->coef_probs[TX_8X8], r); - if (txfm_mode > ALLOW_8X8) - read_coef_probs_common(fc, TX_16X16, r); + if (tx_mode > ALLOW_8X8) + read_coef_probs_common(fc->coef_probs[TX_16X16], r); - if (txfm_mode > ALLOW_16X16) - read_coef_probs_common(fc, TX_32X32, r); + if (tx_mode > ALLOW_16X16) + read_coef_probs_common(fc->coef_probs[TX_32X32], r); } -static void setup_segmentation(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { +static void setup_segmentation(struct segmentation *seg, + struct vp9_read_bit_buffer *rb) { int i, j; - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + seg->update_map = 0; + seg->update_data = 0; - xd->segmentation_enabled = vp9_rb_read_bit(rb); - if (!xd->segmentation_enabled) + seg->enabled = vp9_rb_read_bit(rb); + if (!seg->enabled) return; // Segmentation map update - xd->update_mb_segmentation_map = vp9_rb_read_bit(rb); - if (xd->update_mb_segmentation_map) { - for (i = 0; i < MB_SEG_TREE_PROBS; i++) - xd->mb_segment_tree_probs[i] = vp9_rb_read_bit(rb) ? - vp9_rb_read_literal(rb, 8) : MAX_PROB; - - cm->temporal_update = vp9_rb_read_bit(rb); - if (cm->temporal_update) { + seg->update_map = vp9_rb_read_bit(rb); + if (seg->update_map) { + for (i = 0; i < SEG_TREE_PROBS; i++) + seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) + : MAX_PROB; + + seg->temporal_update = vp9_rb_read_bit(rb); + if (seg->temporal_update) { for (i = 0; i < PREDICTION_PROBS; i++) - cm->segment_pred_probs[i] = vp9_rb_read_bit(rb) ? - vp9_rb_read_literal(rb, 8) : MAX_PROB; + seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) + : MAX_PROB; } else { for (i = 0; i < PREDICTION_PROBS; i++) - cm->segment_pred_probs[i] = MAX_PROB; + seg->pred_probs[i] = MAX_PROB; } } // Segmentation data update - xd->update_mb_segmentation_data = vp9_rb_read_bit(rb); - if (xd->update_mb_segmentation_data) { - xd->mb_segment_abs_delta = vp9_rb_read_bit(rb); + seg->update_data = vp9_rb_read_bit(rb); + if (seg->update_data) { + seg->abs_delta = vp9_rb_read_bit(rb); - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(seg); - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { int data = 0; const int feature_enabled = vp9_rb_read_bit(rb); if (feature_enabled) { - vp9_enable_segfeature(xd, i, j); + vp9_enable_segfeature(seg, i, j); data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j)); if (vp9_is_segfeature_signed(j)) data = vp9_rb_read_bit(rb) ? 
-data : data; } - vp9_set_segdata(xd, i, j, data); + vp9_set_segdata(seg, i, j, data); } } } } -static void setup_loopfilter(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; +static void setup_loopfilter(struct loopfilter *lf, + struct vp9_read_bit_buffer *rb) { - cm->filter_level = vp9_rb_read_literal(rb, 6); - cm->sharpness_level = vp9_rb_read_literal(rb, 3); + lf->filter_level = vp9_rb_read_literal(rb, 6); + lf->sharpness_level = vp9_rb_read_literal(rb, 3); // Read in loop filter deltas applied at the MB level based on mode or ref // frame. - xd->mode_ref_lf_delta_update = 0; + lf->mode_ref_delta_update = 0; - xd->mode_ref_lf_delta_enabled = vp9_rb_read_bit(rb); - if (xd->mode_ref_lf_delta_enabled) { - xd->mode_ref_lf_delta_update = vp9_rb_read_bit(rb); - if (xd->mode_ref_lf_delta_update) { + lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = vp9_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { int i; - for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - if (vp9_rb_read_bit(rb)) { - const int value = vp9_rb_read_literal(rb, 6); - xd->ref_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value; - } - } + for (i = 0; i < MAX_REF_LF_DELTAS; i++) + if (vp9_rb_read_bit(rb)) + lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6); - for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - if (vp9_rb_read_bit(rb)) { - const int value = vp9_rb_read_literal(rb, 6); - xd->mode_lf_deltas[i] = vp9_rb_read_bit(rb) ? -value : value; - } - } + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (vp9_rb_read_bit(rb)) + lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6); } } } static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { const int old = *delta_q; - if (vp9_rb_read_bit(rb)) { - const int value = vp9_rb_read_literal(rb, 4); - *delta_q = vp9_rb_read_bit(rb) ? -value : value; - } + if (vp9_rb_read_bit(rb)) + *delta_q = vp9_rb_read_signed_literal(rb, 4); return old != *delta_q; } @@ -682,11 +488,9 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { cm->y_dc_delta_q == 0 && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - if (xd->lossless) { - xd->itxm_add = vp9_idct_add_lossless_c; - } else { - xd->itxm_add = vp9_idct_add; - } + + xd->itxm_add = xd->lossless ? 
vp9_idct_add_lossless_c + : vp9_idct_add; } static INTERPOLATIONFILTERTYPE read_interp_filter_type( @@ -778,108 +582,90 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, apply_frame_size(pbi, width, height); } -static void update_frame_context(FRAME_CONTEXT *fc) { - vp9_copy(fc->pre_coef_probs, fc->coef_probs); - vp9_copy(fc->pre_y_mode_prob, fc->y_mode_prob); - vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob); - vp9_copy(fc->pre_partition_prob, fc->partition_prob[1]); - vp9_copy(fc->pre_intra_inter_prob, fc->intra_inter_prob); - vp9_copy(fc->pre_comp_inter_prob, fc->comp_inter_prob); - vp9_copy(fc->pre_single_ref_prob, fc->single_ref_prob); - vp9_copy(fc->pre_comp_ref_prob, fc->comp_ref_prob); - fc->pre_nmvc = fc->nmvc; - vp9_copy(fc->pre_switchable_interp_prob, fc->switchable_interp_prob); - vp9_copy(fc->pre_inter_mode_probs, fc->inter_mode_probs); - vp9_copy(fc->pre_tx_probs_8x8p, fc->tx_probs_8x8p); - vp9_copy(fc->pre_tx_probs_16x16p, fc->tx_probs_16x16p); - vp9_copy(fc->pre_tx_probs_32x32p, fc->tx_probs_32x32p); - vp9_copy(fc->pre_mbskip_probs, fc->mbskip_probs); - - vp9_zero(fc->coef_counts); - vp9_zero(fc->eob_branch_counts); - vp9_zero(fc->y_mode_counts); - vp9_zero(fc->uv_mode_counts); - vp9_zero(fc->NMVcount); - vp9_zero(fc->inter_mode_counts); - vp9_zero(fc->partition_counts); - vp9_zero(fc->switchable_interp_count); - vp9_zero(fc->intra_inter_count); - vp9_zero(fc->comp_inter_count); - vp9_zero(fc->single_ref_count); - vp9_zero(fc->comp_ref_count); - vp9_zero(fc->tx_count_8x8p); - vp9_zero(fc->tx_count_16x16p); - vp9_zero(fc->tx_count_32x32p); - vp9_zero(fc->mbskip_count); -} - static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { VP9_COMMON *const pc = &pbi->common; int mi_row, mi_col; - for (mi_row = pc->cur_tile_mi_row_start; - mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) { + if (pbi->do_loopfilter_inline) { + vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level); + } + + for (mi_row = pc->cur_tile_mi_row_start; mi_row < pc->cur_tile_mi_row_end; + mi_row += MI_BLOCK_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within vpx_memset(&pc->left_context, 0, sizeof(pc->left_context)); vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); - for (mi_col = pc->cur_tile_mi_col_start; - mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) + for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE) { decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64); + } + + if (pbi->do_loopfilter_inline) { + YV12_BUFFER_CONFIG *const fb = + &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + // delay the loopfilter by 1 macroblock row. 
+ const int lf_start = mi_row - MI_BLOCK_SIZE; + if (lf_start < 0) continue; + vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0); + } + } + + if (pbi->do_loopfilter_inline) { + YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + vp9_loop_filter_rows(fb, pc, &pbi->mb, + mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0); } } static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - int delta_log2_tiles; + int min_log2_tile_cols, max_log2_tile_cols, max_ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - vp9_get_tile_n_bits(cm, &cm->log2_tile_columns, &delta_log2_tiles); - while (delta_log2_tiles--) { - if (vp9_rb_read_bit(rb)) { - cm->log2_tile_columns++; - } else { - break; - } - } + // columns + max_ones = max_log2_tile_cols - min_log2_tile_cols; + cm->log2_tile_cols = min_log2_tile_cols; + while (max_ones-- && vp9_rb_read_bit(rb)) + cm->log2_tile_cols++; + // rows cm->log2_tile_rows = vp9_rb_read_bit(rb); if (cm->log2_tile_rows) cm->log2_tile_rows += vp9_rb_read_bit(rb); - - cm->tile_columns = 1 << cm->log2_tile_columns; - cm->tile_rows = 1 << cm->log2_tile_rows; } -static void decode_tiles(VP9D_COMP *pbi, - const uint8_t *data, size_t first_partition_size, - vp9_reader *residual_bc) { +static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { + vp9_reader residual_bc; + VP9_COMMON *const pc = &pbi->common; - const uint8_t *data_ptr = data + first_partition_size; - const uint8_t* const data_end = pbi->source + pbi->source_sz; + const uint8_t *const data_end = pbi->source + pbi->source_sz; + const int aligned_mi_cols = mi_cols_aligned_to_sb(pc->mi_cols); + const int tile_cols = 1 << pc->log2_tile_cols; + const int tile_rows = 1 << pc->log2_tile_rows; int tile_row, tile_col; // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
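
[Aside, not part of the patch] When do_loopfilter_inline is set, decode_tile interleaves filtering with decoding but keeps the filter one superblock row behind, presumably because filtering a row's bottom edge needs reconstructed pixels from the row below; a final call flushes the last row. A toy model of that schedule, with decode_row/filter_rows as placeholders:

#include <stdio.h>

#define MI_BLOCK_SIZE 8   /* one 64x64 superblock row, in 8x8 mode-info units */

static void decode_row(int mi_row) {
  printf("decode rows [%2d, %2d)\n", mi_row, mi_row + MI_BLOCK_SIZE);
}
static void filter_rows(int start, int stop) {
  printf("filter rows [%2d, %2d)\n", start, stop);
}

int main(void) {
  const int mi_rows = 24;                        /* a small example frame */
  int mi_row;

  for (mi_row = 0; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
    decode_row(mi_row);
    if (mi_row - MI_BLOCK_SIZE >= 0)             /* stay one row behind the decoder */
      filter_rows(mi_row - MI_BLOCK_SIZE, mi_row);
  }
  filter_rows(mi_row - MI_BLOCK_SIZE, mi_rows);  /* flush the final row */
  return 0;
}
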
- vpx_memset(pc->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 * - MAX_MB_PLANE * mi_cols_aligned_to_sb(pc)); + vpx_memset(pc->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); - vpx_memset(pc->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(pc)); + vpx_memset(pc->above_seg_context, 0, + sizeof(PARTITION_CONTEXT) * aligned_mi_cols); if (pbi->oxcf.inv_tile_order) { - const int n_cols = pc->tile_columns; const uint8_t *data_ptr2[4][1 << 6]; vp9_reader bc_bak = {0}; // pre-initialize the offsets, we're going to read in inverse order - data_ptr2[0][0] = data_ptr; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + data_ptr2[0][0] = data; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { if (tile_row) { - const int size = read_be32(data_ptr2[tile_row - 1][n_cols - 1]); - data_ptr2[tile_row - 1][n_cols - 1] += 4; - data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size; + const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]); + data_ptr2[tile_row - 1][tile_cols - 1] += 4; + data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size; } - for (tile_col = 1; tile_col < n_cols; tile_col++) { + for (tile_col = 1; tile_col < tile_cols; tile_col++) { const int size = read_be32(data_ptr2[tile_row][tile_col - 1]); data_ptr2[tile_row][tile_col - 1] += 4; data_ptr2[tile_row][tile_col] = @@ -887,48 +673,49 @@ static void decode_tiles(VP9D_COMP *pbi, } } - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + for (tile_row = 0; tile_row < tile_rows; tile_row++) { vp9_get_tile_row_offsets(pc, tile_row); - for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) { + for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) { vp9_get_tile_col_offsets(pc, tile_col); setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], data_end - data_ptr2[tile_row][tile_col], - residual_bc); - decode_tile(pbi, residual_bc); - if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1) - bc_bak = *residual_bc; + &residual_bc); + decode_tile(pbi, &residual_bc); + if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1) + bc_bak = residual_bc; } } - *residual_bc = bc_bak; + residual_bc = bc_bak; } else { int has_more; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { + for (tile_row = 0; tile_row < tile_rows; tile_row++) { vp9_get_tile_row_offsets(pc, tile_row); - for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { size_t size; vp9_get_tile_col_offsets(pc, tile_col); - has_more = tile_col < pc->tile_columns - 1 || - tile_row < pc->tile_rows - 1; + has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1; if (has_more) { - if (!read_is_valid(data_ptr, 4, data_end)) + if (!read_is_valid(data, 4, data_end)) vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt tile length"); - size = read_be32(data_ptr); - data_ptr += 4; + size = read_be32(data); + data += 4; } else { - size = data_end - data_ptr; + size = data_end - data; } - setup_token_decoder(pbi, data_ptr, size, residual_bc); - decode_tile(pbi, residual_bc); - data_ptr += size; + setup_token_decoder(pbi, data, size, &residual_bc); + decode_tile(pbi, &residual_bc); + data += size; } } } + + return vp9_reader_find_end(&residual_bc); } static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { @@ -949,10 +736,9 @@ static void setup_inter_inter(VP9_COMMON *cm) { int i; cm->allow_comp_inter_inter = 0; - for (i = 0; i < 
ALLOWED_REFS_PER_FRAME; ++i) { - cm->allow_comp_inter_inter |= i > 0 && + for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i) + cm->allow_comp_inter_inter |= cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]; - } if (cm->allow_comp_inter_inter) { // which one is always-on in comp inter-inter? @@ -999,7 +785,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; - cm->filter_level = 0; + xd->lf.filter_level = 0; return 0; } @@ -1053,7 +839,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LG2); + const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2); cm->active_ref_idx[i] = cm->ref_frame_map[ref]; cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb); } @@ -1078,23 +864,54 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->frame_parallel_decoding_mode = 1; } - cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LG2); + cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only) vp9_setup_past_independence(cm, xd); - setup_loopfilter(pbi, rb); + setup_loopfilter(&xd->lf, rb); setup_quantization(pbi, rb); - setup_segmentation(pbi, rb); + setup_segmentation(&xd->seg, rb); setup_tile_info(cm, rb); return vp9_rb_read_literal(rb, 16); } +static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, + size_t partition_size) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + vp9_reader r; + + if (vp9_reader_init(&r, data, partition_size)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + + cm->tx_mode = xd->lossless ? 
ONLY_4X4 : read_tx_mode(&r); + if (cm->tx_mode == TX_MODE_SELECT) + read_tx_probs(&cm->fc.tx_probs, &r); + read_coef_probs(&cm->fc, cm->tx_mode, &r); + + vp9_prepare_read_mode_info(pbi, &r); + + return vp9_reader_has_error(&r); +} + +void vp9_init_dequantizer(VP9_COMMON *cm) { + int q; + + for (q = 0; q < QINDEX_RANGE; q++) { + cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q); + cm->y_dequant[q][1] = vp9_ac_quant(q, 0); + + cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q); + cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q); + } +} + int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { int i; - vp9_reader header_bc, residual_bc; VP9_COMMON *const pc = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; @@ -1115,6 +932,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { data += vp9_rb_bytes_read(&rb); xd->corrupted = 0; new_fb->corrupted = 0; + pbi->do_loopfilter_inline = + (pc->log2_tile_rows | pc->log2_tile_cols) == 0 && pbi->mb.lf.filter_level; if (!pbi->decoded_key_frame && !keyframe) return -1; @@ -1125,37 +944,29 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { xd->mode_info_context = pc->mi; xd->prev_mode_info_context = pc->prev_mi; - xd->frame_type = pc->frame_type; xd->mode_info_stride = pc->mode_info_stride; - if (vp9_reader_init(&header_bc, data, first_partition_size)) - vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate bool decoder 0"); - - mb_init_dequantizer(pc, &pbi->mb); // MB level dequantizer setup + init_dequantizer(pc, &pbi->mb); if (!keyframe) vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc); pc->fc = pc->frame_contexts[pc->frame_context_idx]; - update_frame_context(&pc->fc); - - setup_txfm_mode(pc, xd->lossless, &header_bc); - - read_coef_probs(pbi, &header_bc); + vp9_zero(pc->counts); // Initialize xd pointers. Any reference should do for xd->pre, so use 0. 
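
[Aside, not part of the patch] vp9_decode_frame now separates the parsing stages cleanly: read_uncompressed_header consumes plain bits and returns the 16-bit size of the compressed header, read_compressed_header runs a bool decoder over exactly that many bytes, and decode_tiles takes over at data + first_partition_size. A schematic of that layout, with made-up offsets and no real parsing:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Pretend frame: | uncompressed header | compressed header | tile data | */
int main(void) {
  const uint8_t frame[64] = {0};
  const size_t uncompressed_header_bytes = 10;  /* known after bit parsing    */
  const size_t first_partition_size = 22;       /* 16-bit field in the header */

  const uint8_t *compressed_hdr = frame + uncompressed_header_bytes;
  const uint8_t *tile_data = compressed_hdr + first_partition_size;

  printf("bool decoder over %zu bytes at offset %td\n",
         first_partition_size, compressed_hdr - frame);
  printf("tile data starts at offset %td\n", tile_data - frame);
  return 0;
}
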
- setup_pre_planes(xd, &pc->yv12_fb[pc->active_ref_idx[0]], NULL, - 0, 0, NULL, NULL); + setup_pre_planes(xd, 0, &pc->yv12_fb[pc->active_ref_idx[0]], 0, 0, NULL); setup_dst_planes(xd, new_fb, 0, 0); + new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size); + // Create the segmentation map structure and set to 0 if (!pc->last_frame_seg_map) - CHECK_MEM_ERROR(pc->last_frame_seg_map, + CHECK_MEM_ERROR(pc, pc->last_frame_seg_map, vpx_calloc((pc->mi_rows * pc->mi_cols), 1)); - vp9_setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y); + setup_block_dptrs(xd, pc->subsampling_x, pc->subsampling_y); // clear out the coeff buffer for (i = 0; i < MAX_MB_PLANE; ++i) @@ -1163,14 +974,12 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { set_prev_mi(pc); - vp9_decode_mode_mvs_init(pbi, &header_bc); - - decode_tiles(pbi, data, first_partition_size, &residual_bc); + *p_data_end = decode_tiles(pbi, data + first_partition_size); pc->last_width = pc->width; pc->last_height = pc->height; - new_fb->corrupted = vp9_reader_has_error(&header_bc) | xd->corrupted; + new_fb->corrupted |= xd->corrupted; if (!pbi->decoded_key_frame) { if (keyframe && !new_fb->corrupted) @@ -1180,20 +989,18 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { "A stream must start with a complete key frame"); } - // Adaptation if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { vp9_adapt_coef_probs(pc); - if ((!keyframe) && (!pc->intra_only)) { + if (!keyframe && !pc->intra_only) { vp9_adapt_mode_probs(pc); vp9_adapt_mode_context(pc); - vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv); + vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv); } } if (pc->refresh_frame_context) pc->frame_contexts[pc->frame_context_idx] = pc->fc; - *p_data_end = vp9_reader_find_end(&residual_bc); return 0; } diff --git a/libvpx/vp9/decoder/vp9_decodframe.h b/libvpx/vp9/decoder/vp9_decodframe.h index 66e951d..00b6d67 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.h +++ b/libvpx/vp9/decoder/vp9_decodframe.h @@ -17,6 +17,5 @@ struct VP9Decompressor; void vp9_init_dequantizer(struct VP9Common *pc); int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end); -vp9_prob vp9_read_prob_diff_update(vp9_reader *r, int oldp); #endif // VP9_DECODER_VP9_DECODFRAME_H_ diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index 3bbb212..01c1db0 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -18,14 +18,8 @@ #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_onyxd_int.h" -#if CONFIG_BALANCED_COEFTREE -#define ZERO_CONTEXT_NODE 0 -#define EOB_CONTEXT_NODE 1 -#else #define EOB_CONTEXT_NODE 0 #define ZERO_CONTEXT_NODE 1 -#endif - #define ONE_CONTEXT_NODE 2 #define LOW_VAL_CONTEXT_NODE 3 #define TWO_CONTEXT_NODE 4 @@ -91,13 +85,15 @@ DECLARE_ALIGNED(16, extern const uint8_t, val += 1 << bits_count; \ } while (0); -static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, +static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, TX_SIZE txfm_size, const int16_t *dq, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { + FRAME_CONTEXT *const fc = &cm->fc; + FRAME_COUNTS *const counts = &cm->counts; ENTROPY_CONTEXT above_ec, left_ec; - int pt, c = 0, pad, default_eob; + int pt, c = 0; int band; vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES]; vp9_prob 
coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; @@ -113,53 +109,31 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, vp9_prob *prob; vp9_coeff_count_model *coef_counts; const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME; - TX_TYPE tx_type = DCT_DCT; - const int *scan, *nb; + const int16_t *scan, *nb; uint8_t token_cache[1024]; const uint8_t * band_translate; -#if CONFIG_BALANCED_COEFTREE - int skip_eob_node = 0; -#endif - coef_probs = fc->coef_probs[txfm_size][type][ref]; - coef_counts = fc->coef_counts[txfm_size]; + coef_counts = counts->coef[txfm_size]; switch (txfm_size) { default: case TX_4X4: { - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_4x4(xd, block_idx) : DCT_DCT; - scan = get_scan_4x4(tx_type); + scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); above_ec = A[0] != 0; left_ec = L[0] != 0; - default_eob = 16; band_translate = vp9_coefband_trans_4x4; break; } case TX_8X8: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 1 + b_width_log2(sb_type); - const int x = block_idx & ((1 << sz) - 1); - const int y = block_idx - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; - scan = get_scan_8x8(tx_type); + scan = get_scan_8x8(get_tx_type_8x8(type, xd)); above_ec = (A[0] + A[1]) != 0; left_ec = (L[0] + L[1]) != 0; - default_eob = 64; band_translate = vp9_coefband_trans_8x8plus; break; } case TX_16X16: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 2 + b_width_log2(sb_type); - const int x = block_idx & ((1 << sz) - 1); - const int y = block_idx - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; - scan = get_scan_16x16(tx_type); + scan = get_scan_16x16(get_tx_type_16x16(type, xd)); above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; - default_eob = 256; band_translate = vp9_coefband_trans_8x8plus; break; } @@ -167,13 +141,12 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, scan = vp9_default_scan_32x32; above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; - default_eob = 1024; band_translate = vp9_coefband_trans_8x8plus; break; } pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); + nb = vp9_get_coef_neighbors_handle(scan); while (1) { int val; @@ -181,43 +154,26 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, if (c >= seg_eob) break; if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, - c, default_eob); + pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; -#if !CONFIG_BALANCED_COEFTREE - fc->eob_branch_counts[txfm_size][type][ref][band][pt]++; + counts->eob_branch[txfm_size][type][ref][band][pt]++; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; SKIP_START: -#endif if (c >= seg_eob) break; if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, - c, default_eob); + pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); ++c; -#if CONFIG_BALANCED_COEFTREE - skip_eob_node = 1; - continue; -#else goto SKIP_START; -#endif - } -#if CONFIG_BALANCED_COEFTREE - if (!skip_eob_node) { - 
fc->eob_branch_counts[txfm_size][type][ref][band][pt]++; - if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) - break; } - skip_eob_node = 0; -#endif // ONE_CONTEXT_NODE_0_ if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) { @@ -293,8 +249,8 @@ SKIP_START: return c; } -static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) { - return vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; +static int get_eob(struct segmentation *seg, int segment_id, int eob_max) { + return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } struct decode_block_args { @@ -315,7 +271,7 @@ static void decode_block(int plane, int block, struct macroblockd_plane* pd = &xd->plane[plane]; const int segment_id = xd->mode_info_context->mbmi.segment_id; const TX_SIZE ss_tx_size = ss_txfrm_size / 2; - const int seg_eob = get_eob(xd, segment_id, 16 << ss_txfrm_size); + const int seg_eob = get_eob(&xd->seg, segment_id, 16 << ss_txfrm_size); const int off = block >> ss_txfrm_size; const int mod = bw - ss_tx_size - pd->subsampling_x; const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size; @@ -323,7 +279,7 @@ static void decode_block(int plane, int block, ENTROPY_CONTEXT *A = pd->above_context + aoff; ENTROPY_CONTEXT *L = pd->left_context + loff; - const int eob = decode_coefs(&arg->pbi->common.fc, xd, arg->r, block, + const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block, 16), ss_tx_size, pd->dequant, A, L); diff --git a/libvpx/vp9/decoder/vp9_dsubexp.c b/libvpx/vp9/decoder/vp9_dsubexp.c new file mode 100644 index 0000000..8cc64f7 --- /dev/null +++ b/libvpx/vp9/decoder/vp9_dsubexp.c @@ -0,0 +1,106 @@ +/* + Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_entropy.h" + +#include "vp9/decoder/vp9_dsubexp.h" + +static int inv_recenter_nonneg(int v, int m) { + if (v > 2 * m) + return v; + + return v % 2 ? m - (v + 1) / 2 : m + v / 2; +} + +static int decode_uniform(vp9_reader *r, int n) { + int v; + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + if (!l) + return 0; + + v = vp9_read_literal(r, l - 1); + return v < m ? 
v : (v << 1) - m + vp9_read_bit(r); +} + + +static int merge_index(int v, int n, int modulus) { + int max1 = (n - 1 - modulus / 2) / modulus + 1; + if (v < max1) { + v = v * modulus + modulus / 2; + } else { + int w; + v -= max1; + w = v; + v += (v + modulus - modulus / 2) / modulus; + while (v % modulus == modulus / 2 || + w != v - (v + modulus - modulus / 2) / modulus) v++; + } + return v; +} + +static int inv_remap_prob(int v, int m) { + static int inv_map_table[MAX_PROB - 1] = { + // generated by: + // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM); + 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, + 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 74, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 85, 86, 87, 88, 89, 90, 91, + 92, 93, 94, 95, 96, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, + 141, 142, 143, 144, 145, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, + 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, + 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, + 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, + 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, + 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + + }; + // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM); + v = inv_map_table[v]; + m--; + if ((m << 1) <= MAX_PROB) { + return 1 + inv_recenter_nonneg(v + 1, m); + } else { + return MAX_PROB - inv_recenter_nonneg(v + 1, MAX_PROB - 1 - m); + } +} + +static int decode_term_subexp(vp9_reader *r, int k, int num_syms) { + int i = 0, mk = 0, word; + while (1) { + const int b = i ? k + i - 1 : k; + const int a = 1 << b; + if (num_syms <= mk + 3 * a) { + word = decode_uniform(r, num_syms - mk) + mk; + break; + } else { + if (vp9_read_bit(r)) { + i++; + mk += a; + } else { + word = vp9_read_literal(r, b) + mk; + break; + } + } + } + return word; +} + +void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) { + int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); + *p = (vp9_prob)inv_remap_prob(delp, *p); +} diff --git a/libvpx/vp9/encoder/vp9_asm_enc_offsets.c b/libvpx/vp9/decoder/vp9_dsubexp.h index 921e8f0..aeb9399 100644 --- a/libvpx/vp9/encoder/vp9_asm_enc_offsets.c +++ b/libvpx/vp9/decoder/vp9_dsubexp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -9,9 +9,11 @@ */ -#include "vpx_ports/asm_offsets.h" +#ifndef VP9_DECODER_VP9_DSUBEXP_H_ +#define VP9_DECODER_VP9_DSUBEXP_H_ -BEGIN +#include "vp9/decoder/vp9_dboolhuff.h" +void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p); -END +#endif // VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c index c52963c..0217919 100644 --- a/libvpx/vp9/decoder/vp9_idct_blk.c +++ b/libvpx/vp9/decoder/vp9_idct_blk.c @@ -66,7 +66,7 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) { vp9_short_idct4x4_add(input, dest, stride); vpx_memset(input, 0, 32); } else { - vp9_dc_only_idct_add(input[0], dest, dest, stride, stride); + vp9_short_idct4x4_1_add(input, dest, stride); ((int *)input)[0] = 0; } } diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index 3cef88b..cb72920 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -136,7 +136,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { // vp9_init_dequantizer() for every frame. vp9_init_dequantizer(&pbi->common); - vp9_loop_filter_init(&pbi->common); + vp9_loop_filter_init(&pbi->common, &pbi->mb.lf); pbi->common.error.setjmp = 0; pbi->decoded_key_frame = 0; @@ -154,7 +154,6 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vpx_free(pbi->common.last_frame_seg_map); vp9_remove_common(&pbi->common); - vpx_free(pbi->mbc); vpx_free(pbi); } @@ -347,9 +346,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->current_video_frame + 1000); #endif - if (cm->filter_level) { + if (!pbi->do_loopfilter_inline) { /* Apply the loop filter if appropriate. 
*/ - vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0); + vp9_loop_filter_frame(cm, &pbi->mb, pbi->mb.lf.filter_level, 0); } #if WRITE_RECON_BUFFER == 2 @@ -361,8 +360,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, cm->current_video_frame + 3000); #endif - vp9_extend_frame_borders(cm->frame_to_show, - cm->subsampling_x, cm->subsampling_y); + vp9_extend_frame_inner_borders(cm->frame_to_show, + cm->subsampling_x, + cm->subsampling_y); } #if WRITE_RECON_BUFFER == 1 @@ -412,9 +412,8 @@ int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, *time_stamp = pbi->last_time_stamp; *time_end_stamp = 0; - sd->clrtype = pbi->common.clr_type; #if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&pbi->common, sd, flags); + ret = vp9_post_proc_frame(&pbi->common, &pbi->mb.lf, sd, flags); #else if (pbi->common.frame_to_show) { diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h index 8698570..4760066 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_int.h +++ b/libvpx/vp9/decoder/vp9_onyxd_int.h @@ -10,13 +10,14 @@ #ifndef VP9_DECODER_VP9_ONYXD_INT_H_ #define VP9_DECODER_VP9_ONYXD_INT_H_ + #include "./vpx_config.h" -#include "vp9/decoder/vp9_onyxd.h" -#include "vp9/decoder/vp9_treereader.h" + #include "vp9/common/vp9_onyxc_int.h" -#include "vp9/decoder/vp9_idct_blk.h" -// #define DEC_DEBUG +#include "vp9/decoder/vp9_idct_blk.h" +#include "vp9/decoder/vp9_onyxd.h" +#include "vp9/decoder/vp9_treereader.h" typedef struct VP9Decompressor { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -28,35 +29,17 @@ typedef struct VP9Decompressor { const uint8_t *source; uint32_t source_sz; - vp9_reader *mbc; int64_t last_time_stamp; int ready_for_new_data; int refresh_frame_flags; - vp9_prob prob_skip_false; int decoded_key_frame; int initial_width; int initial_height; -} VP9D_COMP; - -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval" at %s:%d", \ - __FILE__,__LINE__);\ - } while(0) -#else -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval);\ - } while(0) -#endif + int do_loopfilter_inline; // apply loopfilter to available rows immediately +} VP9D_COMP; #endif // VP9_DECODER_VP9_TREEREADER_H_ diff --git a/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/libvpx/vp9/decoder/vp9_read_bit_buffer.h index f243cb4..c7fa3aa 100644 --- a/libvpx/vp9/decoder/vp9_read_bit_buffer.h +++ b/libvpx/vp9/decoder/vp9_read_bit_buffer.h @@ -51,4 +51,10 @@ static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) { return value; } +static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, + int bits) { + const int value = vp9_rb_read_literal(rb, bits); + return vp9_rb_read_bit(rb) ? 
-value : value; +} + #endif // VP9_READ_BIT_BUFFER_ diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 09ab2db..ad0f6c5 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -32,6 +32,7 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_bitstream.h" #include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_subexp.h" #include "vp9/encoder/vp9_write_bit_buffer.h" @@ -48,8 +49,6 @@ vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES]; extern unsigned int active_section; #endif -#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8) -#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) #ifdef MODE_STATS int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; @@ -155,8 +154,6 @@ void write_switchable_interp_stats() { } #endif -static int update_bits[255]; - static INLINE void write_be32(uint8_t *p, int value) { p[0] = value >> 24; p[1] = value >> 16; @@ -164,248 +161,11 @@ static INLINE void write_be32(uint8_t *p, int value) { p[3] = value; } - - -int recenter_nonneg(int v, int m) { - if (v > (m << 1)) - return v; - else if (v >= m) - return ((v - m) << 1); - else - return ((m - v) << 1) - 1; -} - -static int get_unsigned_bits(unsigned num_values) { - int cat = 0; - if ((num_values--) <= 1) return 0; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; -} - void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, int data, int max) { vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); } -void encode_uniform(vp9_writer *w, int v, int n) { - int l = get_unsigned_bits(n); - int m; - if (l == 0) - return; - m = (1 << l) - n; - if (v < m) { - vp9_write_literal(w, v, l - 1); - } else { - vp9_write_literal(w, m + ((v - m) >> 1), l - 1); - vp9_write_literal(w, (v - m) & 1, 1); - } -} - -int count_uniform(int v, int n) { - int l = get_unsigned_bits(n); - int m; - if (l == 0) return 0; - m = (1 << l) - n; - if (v < m) - return l - 1; - else - return l; -} - -void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) { - int i = 0; - int mk = 0; - while (1) { - int b = (i ? k + i - 1 : k); - int a = (1 << b); - if (num_syms <= mk + 3 * a) { - encode_uniform(w, word - mk, num_syms - mk); - break; - } else { - int t = (word >= mk + a); - vp9_write_literal(w, t, 1); - if (t) { - i = i + 1; - mk += a; - } else { - vp9_write_literal(w, word - mk, b); - break; - } - } - } -} - -int count_term_subexp(int word, int k, int num_syms) { - int count = 0; - int i = 0; - int mk = 0; - while (1) { - int b = (i ? 
k + i - 1 : k); - int a = (1 << b); - if (num_syms <= mk + 3 * a) { - count += count_uniform(word - mk, num_syms - mk); - break; - } else { - int t = (word >= mk + a); - count++; - if (t) { - i = i + 1; - mk += a; - } else { - count += b; - break; - } - } - } - return count; -} - -static void compute_update_table() { - int i; - for (i = 0; i < 254; i++) - update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255); -} - -static int split_index(int i, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (i % modulus == modulus / 2) i = i / modulus; - else i = max1 + i - (i + modulus - modulus / 2) / modulus; - return i; -} - -static int remap_prob(int v, int m) { - const int n = 255; - const int modulus = MODULUS_PARAM; - int i; - v--; - m--; - if ((m << 1) <= n) - i = recenter_nonneg(v, m) - 1; - else - i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1; - - i = split_index(i, n - 1, modulus); - return i; -} - -static void write_prob_diff_update(vp9_writer *w, - vp9_prob newp, vp9_prob oldp) { - int delp = remap_prob(newp, oldp); - encode_term_subexp(w, delp, SUBEXP_PARAM, 255); -} - -static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { - int delp = remap_prob(newp, oldp); - return update_bits[delp] * 256; -} - -static int prob_update_savings(const unsigned int *ct, - const vp9_prob oldp, const vp9_prob newp, - const vp9_prob upd) { - const int old_b = cost_branch256(ct, oldp); - const int new_b = cost_branch256(ct, newp); - const int update_b = 2048 + vp9_cost_upd256; - return old_b - new_b - update_b; -} - -static int prob_diff_update_savings_search(const unsigned int *ct, - const vp9_prob oldp, vp9_prob *bestp, - const vp9_prob upd) { - const int old_b = cost_branch256(ct, oldp); - int new_b, update_b, savings, bestsavings, step; - vp9_prob newp, bestnewp; - - bestsavings = 0; - bestnewp = oldp; - - step = (*bestp > oldp ? -1 : 1); - for (newp = *bestp; newp != oldp; newp += step) { - new_b = cost_branch256(ct, newp); - update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256; - savings = old_b - new_b - update_b; - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - *bestp = bestnewp; - return bestsavings; -} - -static int prob_diff_update_savings_search_model(const unsigned int *ct, - const vp9_prob *oldp, - vp9_prob *bestp, - const vp9_prob upd, - int b, int r) { - int i, old_b, new_b, update_b, savings, bestsavings, step; - int newp; - vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; - vp9_model_to_full_probs(oldp, oldplist); - vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES); - for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) - old_b += cost_branch256(ct + 2 * i, oldplist[i]); - old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]); - - bestsavings = 0; - bestnewp = oldp[PIVOT_NODE]; - - step = (*bestp > oldp[PIVOT_NODE] ? 
-1 : 1); - newp = *bestp; - for (; newp != oldp[PIVOT_NODE]; newp += step) { - if (newp < 1 || newp > 255) continue; - newplist[PIVOT_NODE] = newp; - vp9_model_to_full_probs(newplist, newplist); - for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) - new_b += cost_branch256(ct + 2 * i, newplist[i]); - new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]); - update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + - vp9_cost_upd256; - savings = old_b - new_b - update_b; - if (savings > bestsavings) { - bestsavings = savings; - bestnewp = newp; - } - } - *bestp = bestnewp; - return bestsavings; -} - -static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd, - unsigned int *ct) { - vp9_prob newp; - int savings; - newp = get_binary_prob(ct[0], ct[1]); - assert(newp >= 1); - savings = prob_update_savings(ct, *oldp, newp, upd); - if (savings > 0) { - vp9_write(bc, 1, upd); - vp9_write_prob(bc, newp); - *oldp = newp; - } else { - vp9_write(bc, 0, upd); - } -} - -static void vp9_cond_prob_diff_update(vp9_writer *bc, vp9_prob *oldp, - vp9_prob upd, - unsigned int *ct) { - vp9_prob newp; - int savings; - newp = get_binary_prob(ct[0], ct[1]); - assert(newp >= 1); - savings = prob_diff_update_savings_search(ct, *oldp, &newp, upd); - if (savings > 0) { - vp9_write(bc, 1, upd); - write_prob_diff_update(bc, newp, *oldp); - *oldp = newp; - } else { - vp9_write(bc, 0, upd); - } -} - static void update_mode( vp9_writer *w, int n, @@ -440,16 +200,39 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, (unsigned int *)cpi->y_mode_count[j]); } -void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc) { - VP9_COMMON *const pc = &cpi->common; - int k; +static void write_selected_txfm_size(const VP9_COMP *cpi, TX_SIZE tx_size, + BLOCK_SIZE_TYPE bsize, vp9_writer *w) { + const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs); + vp9_write(w, tx_size != TX_4X4, tx_probs[0]); + if (bsize >= BLOCK_SIZE_MB16X16 && tx_size != TX_4X4) { + vp9_write(w, tx_size != TX_8X8, tx_probs[1]); + if (bsize >= BLOCK_SIZE_SB32X32 && tx_size != TX_8X8) + vp9_write(w, tx_size != TX_16X16, tx_probs[2]); + } +} - for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - vp9_cond_prob_diff_update(bc, &pc->fc.mbskip_probs[k], - VP9_MODE_UPDATE_PROB, pc->fc.mbskip_count[k]); +static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, + vp9_writer *w) { + const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip_coeff = m->mbmi.mb_skip_coeff; + vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd)); + return skip_coeff; } } +void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) { + VP9_COMMON *cm = &cpi->common; + int k; + + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], + VP9_MODE_UPDATE_PROB, cm->counts.mbskip[k]); +} + static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m); } @@ -465,7 +248,7 @@ static void update_switchable_interp_probs(VP9_COMP *const cpi, vp9_tree_probs_from_distribution( vp9_switchable_interp_tree, new_prob[j], branch_ct[j], - pc->fc.switchable_interp_count[j], 0); + pc->counts.switchable_interp[j], 0); } for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) { for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { @@ -486,7 +269,7 @@ static void 
update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) { for (j = 0; j < VP9_INTER_MODES - 1; j++) { vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j], VP9_MODE_UPDATE_PROB, - pc->fc.inter_mode_counts[i][j]); + pc->counts.inter_mode[i][j]); } } } @@ -519,22 +302,13 @@ static void pack_mb_tokens(vp9_writer* const bc, assert(pp != 0); /* skip one or two nodes */ -#if !CONFIG_BALANCED_COEFTREE if (p->skip_eob_node) { n -= p->skip_eob_node; i = 2 * p->skip_eob_node; } -#endif do { const int bb = (v >> --n) & 1; -#if CONFIG_BALANCED_COEFTREE - if (i == 2 && p->skip_eob_node) { - i += 2; - assert(bb == 1); - continue; - } -#endif vp9_write(bc, bb, pp[i >> 1]); i = vp9_coef_tree[i + bb]; } while (n); @@ -563,22 +337,18 @@ static void pack_mb_tokens(vp9_writer* const bc, *tp = p; } -static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m, +static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode, const vp9_prob *p) { -#if CONFIG_DEBUG - assert(NEARESTMV <= m && m <= NEWMV); -#endif - write_token(bc, vp9_sb_mv_ref_tree, p, - vp9_sb_mv_ref_encoding_array - NEARESTMV + m); + assert(is_inter_mode(mode)); + write_token(w, vp9_inter_mode_tree, p, + &vp9_inter_mode_encodings[mode - NEARESTMV]); } -// This function writes the current macro block's segnment id to the bitstream -// It should only be called if a segment map update is indicated. -static void write_mb_segid(vp9_writer *bc, - const MB_MODE_INFO *mi, const MACROBLOCKD *xd) { - if (xd->segmentation_enabled && xd->update_mb_segmentation_map) - treed_write(bc, vp9_segment_tree, xd->mb_segment_tree_probs, - mi->segment_id, 3); + +static void write_segment_id(vp9_writer *w, const struct segmentation *seg, + int segment_id) { + if (seg->enabled && seg->update_map) + treed_write(w, vp9_segment_tree, seg->tree_probs, segment_id, 3); } // This function encodes the reference frame @@ -588,7 +358,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mi = &xd->mode_info_context->mbmi; const int segment_id = mi->segment_id; - int seg_ref_active = vp9_segfeature_active(xd, segment_id, + int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME); // If segment level coding of this signal is disabled... 
// or the segment allows multiple reference frame options @@ -597,7 +367,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { // (if not specified at the frame/segment level) if (pc->comp_pred_mode == HYBRID_PREDICTION) { vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME, - vp9_get_pred_prob(pc, xd, PRED_COMP_INTER_INTER)); + vp9_get_pred_prob_comp_inter_inter(pc, xd)); } else { assert((mi->ref_frame[1] <= INTRA_FRAME) == (pc->comp_pred_mode == SINGLE_PREDICTION_ONLY)); @@ -605,17 +375,17 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { if (mi->ref_frame[1] > INTRA_FRAME) { vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME, - vp9_get_pred_prob(pc, xd, PRED_COMP_REF_P)); + vp9_get_pred_prob_comp_ref_p(pc, xd)); } else { vp9_write(bc, mi->ref_frame[0] != LAST_FRAME, - vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P1)); + vp9_get_pred_prob_single_ref_p1(pc, xd)); if (mi->ref_frame[0] != LAST_FRAME) vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME, - vp9_get_pred_prob(pc, xd, PRED_SINGLE_REF_P2)); + vp9_get_pred_prob_single_ref_p2(pc, xd)); } } else { assert(mi->ref_frame[1] <= INTRA_FRAME); - assert(vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) == + assert(vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) == mi->ref_frame[0]); } @@ -629,60 +399,42 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, const nmv_context *nmvc = &pc->fc.nmvc; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + struct segmentation *seg = &xd->seg; MB_MODE_INFO *const mi = &m->mbmi; const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; const MB_PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; int skip_coeff; + const BLOCK_SIZE_TYPE bsize = mi->sb_type; - xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi); x->partition_info = x->pi + (m - pc->mi); #ifdef ENTROPY_STATS active_section = 9; #endif - if (cpi->mb.e_mbd.update_mb_segmentation_map) { - // Is temporal coding of the segment map enabled - if (pc->temporal_update) { - unsigned char prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID); - vp9_prob pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID); - - // Code the segment id prediction flag for this mb - vp9_write(bc, prediction_flag, pred_prob); - - // If the mb segment id wasn't predicted code explicitly - if (!prediction_flag) - write_mb_segid(bc, mi, &cpi->mb.e_mbd); + if (seg->update_map) { + if (seg->temporal_update) { + const int pred_flag = mi->seg_id_predicted; + vp9_prob pred_prob = vp9_get_pred_prob_seg_id(xd); + vp9_write(bc, pred_flag, pred_prob); + if (!pred_flag) + write_segment_id(bc, seg, segment_id); } else { - // Normal unpredicted coding - write_mb_segid(bc, mi, &cpi->mb.e_mbd); + write_segment_id(bc, seg, segment_id); } } - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - skip_coeff = 1; - } else { - skip_coeff = m->mbmi.mb_skip_coeff; - vp9_write(bc, skip_coeff, - vp9_get_pred_prob(pc, xd, PRED_MBSKIP)); - } + skip_coeff = write_skip_coeff(cpi, segment_id, m, bc); - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(bc, rf != INTRA_FRAME, - vp9_get_pred_prob(pc, xd, PRED_INTRA_INTER)); + vp9_get_pred_prob_intra_inter(pc, xd)); - if (mi->sb_type >= BLOCK_SIZE_SB8X8 && pc->txfm_mode == TX_MODE_SELECT && + if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT && !(rf != INTRA_FRAME && - (skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { - TX_SIZE sz = mi->txfm_size; - const vp9_prob 
*tx_probs = vp9_get_pred_probs(pc, xd, PRED_TX_SIZE); - vp9_write(bc, sz != TX_4X4, tx_probs[0]); - if (mi->sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) { - vp9_write(bc, sz != TX_8X8, tx_probs[1]); - if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8) - vp9_write(bc, sz != TX_16X16, tx_probs[2]); - } + (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { + write_selected_txfm_size(cpi, mi->txfm_size, bsize, bc); } if (rf == INTRA_FRAME) { @@ -690,28 +442,24 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, active_section = 6; #endif - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + if (bsize >= BLOCK_SIZE_SB8X8) { const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); const int bsl = MIN(bwl, bhl); write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]); } else { int idx, idy; - int bw = 1 << b_width_log2(mi->sb_type); - int bh = 1 << b_height_log2(mi->sb_type); - for (idy = 0; idy < 2; idy += bh) - for (idx = 0; idx < 2; idx += bw) { - MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode.first; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]); } } - write_intra_mode(bc, mi->uv_mode, - pc->fc.uv_mode_prob[mode]); + write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); } else { vp9_prob *mv_ref_p; - encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mb_mode_context[rf]]; #ifdef ENTROPY_STATS @@ -719,8 +467,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #endif // If segment skip is not enabled code the mode. 
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - if (mi->sb_type >= BLOCK_SIZE_SB8X8) { + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (bsize >= BLOCK_SIZE_SB8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]); } @@ -728,38 +476,37 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, if (cpi->common.mcomp_filter_type == SWITCHABLE) { write_token(bc, vp9_switchable_interp_tree, - vp9_get_pred_probs(&cpi->common, xd, - PRED_SWITCHABLE_INTERP), + vp9_get_pred_probs_switchable_interp(&cpi->common, xd), vp9_switchable_interp_encodings + vp9_switchable_interp_map[mi->interp_filter]); } else { assert(mi->interp_filter == cpi->common.mcomp_filter_type); } - if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_SIZE_SB8X8) { int j; MB_PREDICTION_MODE blockmode; int_mv blockmv; - int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl; - int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { j = idy * 2 + idx; - blockmode = cpi->mb.partition_info->bmi[j].mode; - blockmv = cpi->mb.partition_info->bmi[j].mv; + blockmode = x->partition_info->bmi[j].mode; + blockmv = m->bmi[j].as_mv[0]; write_sb_mv_ref(bc, blockmode, mv_ref_p); vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]); if (blockmode == NEWMV) { #ifdef ENTROPY_STATS active_section = 11; #endif - vp9_encode_mv(bc, &blockmv.as_mv, &mi->best_mv.as_mv, + vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv, nmvc, xd->allow_high_precision_mv); if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(bc, - &cpi->mb.partition_info->bmi[j].second_mv.as_mv, + vp9_encode_mv(cpi, bc, + &m->bmi[j].as_mv[1].as_mv, &mi->best_second_mv.as_mv, nmvc, xd->allow_high_precision_mv); } @@ -769,12 +516,12 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #ifdef ENTROPY_STATS active_section = 5; #endif - vp9_encode_mv(bc, + vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv, nmvc, xd->allow_high_precision_mv); if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(bc, + vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, nmvc, xd->allow_high_precision_mv); } @@ -789,54 +536,40 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, const int ym = m->mbmi.mode; const int mis = c->mode_info_stride; const int segment_id = m->mbmi.segment_id; - int skip_coeff; - if (xd->update_mb_segmentation_map) - write_mb_segid(bc, &m->mbmi, xd); + if (xd->seg.update_map) + write_segment_id(bc, &xd->seg, m->mbmi.segment_id); - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - skip_coeff = 1; - } else { - skip_coeff = m->mbmi.mb_skip_coeff; - vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP)); - } + write_skip_coeff(cpi, segment_id, m, bc); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT) { - TX_SIZE sz = m->mbmi.txfm_size; - const vp9_prob *tx_probs = vp9_get_pred_probs(c, xd, PRED_TX_SIZE); - vp9_write(bc, sz != TX_4X4, tx_probs[0]); - if (m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 && sz != TX_4X4) { - vp9_write(bc, sz != TX_8X8, tx_probs[1]); - if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8) - vp9_write(bc, sz != TX_16X16, 
tx_probs[2]); - } - } + if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT) + write_selected_txfm_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc); if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); const MB_PREDICTION_MODE L = xd->left_available ? left_block_mode(m, 0) : DC_PRED; - write_intra_mode(bc, ym, c->kf_y_mode_prob[A][L]); + write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; - int bw = 1 << b_width_log2(m->mbmi.sb_type); - int bh = 1 << b_height_log2(m->mbmi.sb_type); - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { int i = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, i, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? left_block_mode(m, i) : DC_PRED; - const int bm = m->bmi[i].as_mode.first; + const int bm = m->bmi[i].as_mode; #ifdef ENTROPY_STATS ++intra_mode_stats[A][L][bm]; #endif - write_intra_mode(bc, bm, c->kf_y_mode_prob[A][L]); + write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]); } } } - write_intra_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]); + write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); } static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, @@ -875,30 +608,16 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl, bhl; int bsl = b_width_log2(bsize); int bs = (1 << bsl) / 4; // mode_info step for subsize int n; - PARTITION_TYPE partition; + PARTITION_TYPE partition = PARTITION_NONE; BLOCK_SIZE_TYPE subsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = b_width_log2(m->mbmi.sb_type); - bhl = b_height_log2(m->mbmi.sb_type); - - // parse the partition type - if ((bwl == bsl) && (bhl == bsl)) - partition = PARTITION_NONE; - else if ((bwl == bsl) && (bhl < bsl)) - partition = PARTITION_HORZ; - else if ((bwl < bsl) && (bhl == bsl)) - partition = PARTITION_VERT; - else if ((bwl < bsl) && (bhl < bsl)) - partition = PARTITION_SPLIT; - else - assert(0); + partition = partition_lookup[bsl][m->mbmi.sb_type]; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index > 0) @@ -906,9 +625,8 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, if (bsize >= BLOCK_SIZE_SB8X8) { int pl; - int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize); - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = cm->above_seg_context + mi_col; + const int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize); + set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); // encode the partition information if (idx == 0) @@ -968,14 +686,12 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, m_ptr += c->cur_tile_mi_col_start + c->cur_tile_mi_row_start * mis; - for (mi_row = c->cur_tile_mi_row_start; - mi_row < c->cur_tile_mi_row_end; + for (mi_row = c->cur_tile_mi_row_start; mi_row < c->cur_tile_mi_row_end; mi_row += 8, m_ptr += 8 * mis) { m = m_ptr; - vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context)); - for (mi_col = c->cur_tile_mi_col_start; - mi_col < c->cur_tile_mi_col_end; - 
mi_col += 64 / MI_SIZE, m += 64 / MI_SIZE) + vp9_zero(c->left_seg_context); + for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE) write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_SIZE_SB64X64); } @@ -1014,7 +730,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) { vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[txfm_size]; vp9_coeff_count *coef_counts = cpi->coef_counts[txfm_size]; unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cpi->common.fc.eob_branch_counts[txfm_size]; + cpi->common.counts.eob_branch[txfm_size]; vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[txfm_size]; vp9_prob full_probs[ENTROPY_NODES]; int i, j, k, l; @@ -1031,19 +747,11 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) { coef_counts[i][j][k][l], 0); vpx_memcpy(coef_probs[i][j][k][l], full_probs, sizeof(vp9_prob) * UNCONSTRAINED_NODES); -#if CONFIG_BALANCED_COEFTREE - coef_branch_ct[i][j][k][l][1][1] = eob_branch_ct[i][j][k][l] - - coef_branch_ct[i][j][k][l][1][0]; - coef_probs[i][j][k][l][1] = - get_binary_prob(coef_branch_ct[i][j][k][l][1][0], - coef_branch_ct[i][j][k][l][1][1]); -#else coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0]; coef_probs[i][j][k][l][0] = get_binary_prob(coef_branch_ct[i][j][k][l][0][0], coef_branch_ct[i][j][k][l][0][1]); -#endif #ifdef ENTROPY_STATS if (!cpi->dummy_packing) { int t; @@ -1096,11 +804,11 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (l >= 3 && k == 0) continue; if (t == PIVOT_NODE) - s = prob_diff_update_savings_search_model( + s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); else - s = prob_diff_update_savings_search( + s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); if (s > 0 && newp != oldp) u = 1; @@ -1137,11 +845,11 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (l >= 3 && k == 0) continue; if (t == PIVOT_NODE) - s = prob_diff_update_savings_search_model( + s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); else - s = prob_diff_update_savings_search( + s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd); if (s > 0 && newp != *oldp) @@ -1153,7 +861,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, #endif if (u) { /* send/use new probability */ - write_prob_diff_update(bc, newp, *oldp); + vp9_write_prob_diff_update(bc, newp, *oldp); *oldp = newp; } } @@ -1164,7 +872,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, } static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { - const TXFM_MODE txfm_mode = cpi->common.txfm_mode; + const TX_MODE tx_mode = cpi->common.tx_mode; vp9_clear_system_state(); @@ -1174,39 +882,39 @@ static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { update_coef_probs_common(bc, cpi, TX_4X4); // do not do this if not even allowed - if (txfm_mode > ONLY_4X4) + if (tx_mode > ONLY_4X4) update_coef_probs_common(bc, cpi, TX_8X8); - if (txfm_mode > ALLOW_8X8) + if (tx_mode > ALLOW_8X8) update_coef_probs_common(bc, cpi, TX_16X16); - if (txfm_mode > ALLOW_16X16) + if (tx_mode > ALLOW_16X16) update_coef_probs_common(bc, cpi, TX_32X32); 
} -static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, +static void encode_loopfilter(struct loopfilter *lf, struct vp9_write_bit_buffer *wb) { int i; // Encode the loop filter level and type - vp9_wb_write_literal(wb, pc->filter_level, 6); - vp9_wb_write_literal(wb, pc->sharpness_level, 3); + vp9_wb_write_literal(wb, lf->filter_level, 6); + vp9_wb_write_literal(wb, lf->sharpness_level, 3); // Write out loop filter deltas applied at the MB level based on mode or // ref frame (if they are enabled). - vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_enabled); + vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled); - if (xd->mode_ref_lf_delta_enabled) { + if (lf->mode_ref_delta_enabled) { // Do the deltas need to be updated - vp9_wb_write_bit(wb, xd->mode_ref_lf_delta_update); - if (xd->mode_ref_lf_delta_update) { + vp9_wb_write_bit(wb, lf->mode_ref_delta_update); + if (lf->mode_ref_delta_update) { // Send update for (i = 0; i < MAX_REF_LF_DELTAS; i++) { - const int delta = xd->ref_lf_deltas[i]; + const int delta = lf->ref_deltas[i]; // Frame level data - if (delta != xd->last_ref_lf_deltas[i]) { - xd->last_ref_lf_deltas[i] = delta; + if (delta != lf->last_ref_deltas[i]) { + lf->last_ref_deltas[i] = delta; vp9_wb_write_bit(wb, 1); assert(delta != 0); @@ -1219,9 +927,9 @@ static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, // Send update for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { - const int delta = xd->mode_lf_deltas[i]; - if (delta != xd->last_mode_lf_deltas[i]) { - xd->last_mode_lf_deltas[i] = delta; + const int delta = lf->mode_deltas[i]; + if (delta != lf->last_mode_deltas[i]) { + lf->last_mode_deltas[i] = delta; vp9_wb_write_bit(wb, 1); assert(delta != 0); @@ -1255,23 +963,23 @@ static void encode_quantization(VP9_COMMON *cm, static void encode_segmentation(VP9_COMP *cpi, - struct vp9_write_bit_buffer *wb) { + struct vp9_write_bit_buffer *wb) { int i, j; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; - vp9_wb_write_bit(wb, xd->segmentation_enabled); - if (!xd->segmentation_enabled) + struct segmentation *seg = &cpi->mb.e_mbd.seg; + + vp9_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; // Segmentation map - vp9_wb_write_bit(wb, xd->update_mb_segmentation_map); - if (xd->update_mb_segmentation_map) { + vp9_wb_write_bit(wb, seg->update_map); + if (seg->update_map) { // Select the coding strategy (temporal or spatial) vp9_choose_segmap_coding_method(cpi); // Write out probabilities used to decode unpredicted macro-block segments - for (i = 0; i < MB_SEG_TREE_PROBS; i++) { - const int prob = xd->mb_segment_tree_probs[i]; + for (i = 0; i < SEG_TREE_PROBS; i++) { + const int prob = seg->tree_probs[i]; const int update = prob != MAX_PROB; vp9_wb_write_bit(wb, update); if (update) @@ -1279,10 +987,10 @@ static void encode_segmentation(VP9_COMP *cpi, } // Write out the chosen coding method. 
- vp9_wb_write_bit(wb, cm->temporal_update); - if (cm->temporal_update) { + vp9_wb_write_bit(wb, seg->temporal_update); + if (seg->temporal_update) { for (i = 0; i < PREDICTION_PROBS; i++) { - const int prob = cm->segment_pred_probs[i]; + const int prob = seg->pred_probs[i]; const int update = prob != MAX_PROB; vp9_wb_write_bit(wb, update); if (update) @@ -1292,16 +1000,16 @@ static void encode_segmentation(VP9_COMP *cpi, } // Segmentation data - vp9_wb_write_bit(wb, xd->update_mb_segmentation_data); - if (xd->update_mb_segmentation_data) { - vp9_wb_write_bit(wb, xd->mb_segment_abs_delta); + vp9_wb_write_bit(wb, seg->update_data); + if (seg->update_data) { + vp9_wb_write_bit(wb, seg->abs_delta); - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { - const int active = vp9_segfeature_active(xd, i, j); + const int active = vp9_segfeature_active(seg, i, j); vp9_wb_write_bit(wb, active); if (active) { - const int data = vp9_get_segdata(xd, i, j); + const int data = vp9_get_segdata(seg, i, j); const int data_max = vp9_seg_feature_data_max(j); if (vp9_is_segfeature_signed(j)) { @@ -1321,12 +1029,12 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; // Mode - vp9_write_literal(w, MIN(cm->txfm_mode, ALLOW_32X32), 2); - if (cm->txfm_mode >= ALLOW_32X32) - vp9_write_bit(w, cm->txfm_mode == TX_MODE_SELECT); + vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2); + if (cm->tx_mode >= ALLOW_32X32) + vp9_write_bit(w, cm->tx_mode == TX_MODE_SELECT); // Probabilities - if (cm->txfm_mode == TX_MODE_SELECT) { + if (cm->tx_mode == TX_MODE_SELECT) { int i, j; unsigned int ct_8x8p[TX_SIZE_MAX_SB - 3][2]; unsigned int ct_16x16p[TX_SIZE_MAX_SB - 2][2]; @@ -1334,28 +1042,26 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i], + tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p); - for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) { - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_8x8p[i][j], + for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], VP9_MODE_UPDATE_PROB, ct_8x8p[j]); - } } + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i], + tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) { - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_16x16p[i][j], + for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], VP9_MODE_UPDATE_PROB, ct_16x16p[j]); - } } + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i], - ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) { - vp9_cond_prob_diff_update(w, &cm->fc.tx_probs_32x32p[i][j], + tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); + for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) + vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], VP9_MODE_UPDATE_PROB, ct_32x32p[j]); - } } #ifdef MODE_STATS if (!cpi->dummy_packing) @@ -1381,7 +1087,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { count[i] = 0; for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) - count[i] += cm->fc.switchable_interp_count[j][i]; + count[i] += cm->counts.switchable_interp[j][i]; c += (count[i] > 0); } if (c == 1) { @@ -1397,18 +1103,18 @@ static void 
fix_mcomp_filter_type(VP9_COMP *cpi) { } static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { - int min_log2_tiles, delta_log2_tiles, n_tile_bits, n; - vp9_get_tile_n_bits(cm, &min_log2_tiles, &delta_log2_tiles); - n_tile_bits = cm->log2_tile_columns - min_log2_tiles; - for (n = 0; n < delta_log2_tiles; n++) { - if (n_tile_bits--) { - vp9_wb_write_bit(wb, 1); - } else { - vp9_wb_write_bit(wb, 0); - break; - } - } + int min_log2_tile_cols, max_log2_tile_cols, ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + // columns + ones = cm->log2_tile_cols - min_log2_tile_cols; + while (ones--) + vp9_wb_write_bit(wb, 1); + + if (cm->log2_tile_cols < max_log2_tile_cols) + vp9_wb_write_bit(wb, 0); + + // rows vp9_wb_write_bit(wb, cm->log2_tile_rows != 0); if (cm->log2_tile_rows != 0) vp9_wb_write_bit(wb, cm->log2_tile_rows != 1); @@ -1449,6 +1155,57 @@ static int get_refresh_mask(VP9_COMP *cpi) { } } +static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { + VP9_COMMON *const cm = &cpi->common; + vp9_writer residual_bc; + + int tile_row, tile_col; + TOKENEXTRA *tok[4][1 << 6], *tok_end; + size_t total_size = 0; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + + vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * + mi_cols_aligned_to_sb(cm->mi_cols)); + + tok[0][0] = cpi->tok; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + if (tile_row) + tok[tile_row][0] = tok[tile_row - 1][tile_cols - 1] + + cpi->tok_count[tile_row - 1][tile_cols - 1]; + + for (tile_col = 1; tile_col < tile_cols; tile_col++) + tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] + + cpi->tok_count[tile_row][tile_col - 1]; + } + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + vp9_get_tile_row_offsets(cm, tile_row); + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + vp9_get_tile_col_offsets(cm, tile_col); + tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; + + if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) + vp9_start_encode(&residual_bc, data_ptr + total_size + 4); + else + vp9_start_encode(&residual_bc, data_ptr + total_size); + + write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); + assert(tok[tile_row][tile_col] == tok_end); + vp9_stop_encode(&residual_bc); + if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { + // size of this tile + write_be32(data_ptr + total_size, residual_bc.pos); + total_size += 4; + } + + total_size += residual_bc.pos; + } + } + + return total_size; +} + static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; @@ -1562,7 +1319,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, int i; vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES); for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LG2); + vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LOG2); vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]); } @@ -1580,66 +1337,27 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode); } - vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LG2); + vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2); - encode_loopfilter(cm, xd, wb); + encode_loopfilter(&xd->lf, wb); encode_quantization(cm, wb); encode_segmentation(cpi, wb); write_tile_info(cm, wb); } -void 
vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { - int i, bytes_packed; - VP9_COMMON *const pc = &cpi->common; - vp9_writer header_bc, residual_bc; +static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; + FRAME_CONTEXT *const fc = &cm->fc; + vp9_writer header_bc; - uint8_t *cx_data = dest; - struct vp9_write_bit_buffer wb = {dest, 0}; - struct vp9_write_bit_buffer first_partition_size_wb; - - write_uncompressed_header(cpi, &wb); - first_partition_size_wb = wb; - vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size - - bytes_packed = vp9_rb_bytes_written(&wb); - cx_data += bytes_packed; + vp9_start_encode(&header_bc, data); - compute_update_table(); - - vp9_start_encode(&header_bc, cx_data); - -#ifdef ENTROPY_STATS - if (pc->frame_type == INTER_FRAME) - active_section = 0; + if (xd->lossless) + cm->tx_mode = ONLY_4X4; else - active_section = 7; -#endif - - vp9_clear_system_state(); // __asm emms; - - vp9_copy(pc->fc.pre_coef_probs, pc->fc.coef_probs); - vp9_copy(pc->fc.pre_y_mode_prob, pc->fc.y_mode_prob); - vp9_copy(pc->fc.pre_uv_mode_prob, pc->fc.uv_mode_prob); - vp9_copy(pc->fc.pre_partition_prob, pc->fc.partition_prob[INTER_FRAME]); - pc->fc.pre_nmvc = pc->fc.nmvc; - vp9_copy(pc->fc.pre_switchable_interp_prob, pc->fc.switchable_interp_prob); - vp9_copy(pc->fc.pre_inter_mode_probs, pc->fc.inter_mode_probs); - vp9_copy(pc->fc.pre_intra_inter_prob, pc->fc.intra_inter_prob); - vp9_copy(pc->fc.pre_comp_inter_prob, pc->fc.comp_inter_prob); - vp9_copy(pc->fc.pre_comp_ref_prob, pc->fc.comp_ref_prob); - vp9_copy(pc->fc.pre_single_ref_prob, pc->fc.single_ref_prob); - vp9_copy(pc->fc.pre_tx_probs_8x8p, pc->fc.tx_probs_8x8p); - vp9_copy(pc->fc.pre_tx_probs_16x16p, pc->fc.tx_probs_16x16p); - vp9_copy(pc->fc.pre_tx_probs_32x32p, pc->fc.tx_probs_32x32p); - vp9_copy(pc->fc.pre_mbskip_probs, pc->fc.mbskip_probs); - - if (xd->lossless) { - pc->txfm_mode = ONLY_4X4; - } else { encode_txfm_probs(cpi, &header_bc); - } update_coef_probs(cpi, &header_bc); @@ -1649,124 +1367,106 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { vp9_update_skip_probs(cpi, &header_bc); - if (pc->frame_type != KEY_FRAME) { + if (cm->frame_type != KEY_FRAME) { + int i; #ifdef ENTROPY_STATS active_section = 1; #endif - update_inter_mode_probs(pc, &header_bc); - vp9_zero(cpi->common.fc.inter_mode_counts); + update_inter_mode_probs(cm, &header_bc); + vp9_zero(cm->counts.inter_mode); - if (pc->mcomp_filter_type == SWITCHABLE) + if (cm->mcomp_filter_type == SWITCHABLE) update_switchable_interp_probs(cpi, &header_bc); for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - vp9_cond_prob_diff_update(&header_bc, &pc->fc.intra_inter_prob[i], + vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], VP9_MODE_UPDATE_PROB, cpi->intra_inter_count[i]); - if (pc->allow_comp_inter_inter) { + if (cm->allow_comp_inter_inter) { const int comp_pred_mode = cpi->common.comp_pred_mode; - const int use_compound_pred = (comp_pred_mode != SINGLE_PREDICTION_ONLY); - const int use_hybrid_pred = (comp_pred_mode == HYBRID_PREDICTION); + const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY; + const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION; vp9_write_bit(&header_bc, use_compound_pred); if (use_compound_pred) { vp9_write_bit(&header_bc, use_hybrid_pred); - if (use_hybrid_pred) { + if (use_hybrid_pred) for (i = 0; i < COMP_INTER_CONTEXTS; i++) - 
vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_inter_prob[i], + vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], VP9_MODE_UPDATE_PROB, cpi->comp_inter_count[i]); - } } } - if (pc->comp_pred_mode != COMP_PREDICTION_ONLY) { + if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { for (i = 0; i < REF_CONTEXTS; i++) { - vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][0], + vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], VP9_MODE_UPDATE_PROB, cpi->single_ref_count[i][0]); - vp9_cond_prob_diff_update(&header_bc, &pc->fc.single_ref_prob[i][1], + vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], VP9_MODE_UPDATE_PROB, cpi->single_ref_count[i][1]); } } - if (pc->comp_pred_mode != SINGLE_PREDICTION_ONLY) { + if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) for (i = 0; i < REF_CONTEXTS; i++) - vp9_cond_prob_diff_update(&header_bc, &pc->fc.comp_ref_prob[i], + vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], VP9_MODE_UPDATE_PROB, cpi->comp_ref_count[i]); - } update_mbintra_mode_probs(cpi, &header_bc); for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) { - vp9_prob Pnew[PARTITION_TYPES - 1]; + vp9_prob pnew[PARTITION_TYPES - 1]; unsigned int bct[PARTITION_TYPES - 1][2]; update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings, - vp9_partition_tree, Pnew, - pc->fc.partition_prob[pc->frame_type][i], bct, + vp9_partition_tree, pnew, + fc->partition_prob[cm->frame_type][i], bct, (unsigned int *)cpi->partition_count[i]); } vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc); } - vp9_stop_encode(&header_bc); + assert(header_bc.pos <= 0xffff); + return header_bc.pos; +} - // first partition size - assert(header_bc.pos <= 0xffff); - vp9_wb_write_literal(&first_partition_size_wb, header_bc.pos, 16); - *size = bytes_packed + header_bc.pos; - - { - int tile_row, tile_col, total_size = 0; - unsigned char *data_ptr = cx_data + header_bc.pos; - TOKENEXTRA *tok[4][1 << 6], *tok_end; - - vpx_memset(cpi->common.above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(&cpi->common)); - tok[0][0] = cpi->tok; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { - if (tile_row) { - tok[tile_row][0] = tok[tile_row - 1][pc->tile_columns - 1] + - cpi->tok_count[tile_row - 1][pc->tile_columns - 1]; - } - for (tile_col = 1; tile_col < pc->tile_columns; tile_col++) { - tok[tile_row][tile_col] = tok[tile_row][tile_col - 1] + - cpi->tok_count[tile_row][tile_col - 1]; - } - } +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { + uint8_t *data = dest; + size_t first_part_size; + struct vp9_write_bit_buffer wb = {data, 0}; + struct vp9_write_bit_buffer saved_wb; - for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { - vp9_get_tile_row_offsets(pc, tile_row); - for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) { - vp9_get_tile_col_offsets(pc, tile_col); - tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col]; - - if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) - vp9_start_encode(&residual_bc, data_ptr + total_size + 4); - else - vp9_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end); - assert(tok[tile_row][tile_col] == tok_end); - vp9_stop_encode(&residual_bc); - if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) { - // size of this tile - write_be32(data_ptr + total_size, residual_bc.pos); - total_size += 4; - } + write_uncompressed_header(cpi, &wb); + 
saved_wb = wb; + vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size - total_size += residual_bc.pos; - } - } + data += vp9_rb_bytes_written(&wb); - *size += total_size; - } + vp9_compute_update_table(); + +#ifdef ENTROPY_STATS + if (pc->frame_type == INTER_FRAME) + active_section = 0; + else + active_section = 7; +#endif + + vp9_clear_system_state(); // __asm emms; + + first_part_size = write_compressed_header(cpi, data); + data += first_part_size; + vp9_wb_write_literal(&saved_wb, first_part_size, 16); + + data += encode_tiles(cpi, data); + + *size = data - dest; } #ifdef ENTROPY_STATS diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 59cc3d9..4b49b17 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -24,11 +24,8 @@ typedef struct { } search_site; typedef struct { - int count; struct { MB_PREDICTION_MODE mode; - int_mv mv; - int_mv second_mv; } bmi[4]; } PARTITION_INFO; @@ -51,6 +48,7 @@ typedef struct { int comp_pred_diff; int single_pred_diff; int64_t txfm_rd_diff[NB_TXFM_MODES]; + int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; // Bit flag for each mode whether it has high error in comparison to others. unsigned int modes_with_high_error; @@ -66,9 +64,8 @@ struct macroblock_plane { // Quantizer setings int16_t *quant; - uint8_t *quant_shift; + int16_t *quant_shift; int16_t *zbin; - int16_t *zrun_zbin_boost; int16_t *round; // Zbin Over Quant value @@ -99,6 +96,7 @@ struct macroblock { signed int act_zbin_adj; int mv_best_ref_index[MAX_REF_FRAMES]; + unsigned int max_mv_context[MAX_REF_FRAMES]; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; @@ -115,6 +113,7 @@ struct macroblock { int **mvsadcost; int mbmode_cost[MB_MODE_COUNT]; + unsigned inter_mode_cost[INTER_MODE_CONTEXTS][MB_MODE_COUNT - NEARESTMV]; int intra_uv_mode_cost[2][MB_MODE_COUNT]; int y_mode_costs[VP9_INTRA_MODES][VP9_INTRA_MODES][VP9_INTRA_MODES]; int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1] @@ -134,13 +133,18 @@ struct macroblock { unsigned char *active_ptr; // note that token_costs is the cost when eob node is skipped - vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2]; int optimize; // indicate if it is in the rd search loop or encoding process int rd_search; + int skip_encode; + + // Used to store sub partition's choices. + int fast_ms; + int_mv pred_mv; + int subblock_ref; // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. 
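
Note on the packing layout used above: the restructured vp9_pack_bitstream() writes the uncompressed header, reserves a 16-bit slot for the compressed-header size (back-patched through saved_wb after write_compressed_header() returns), and then appends the tile data from encode_tiles(), where every tile except the last is prefixed with its length as four big-endian bytes via write_be32(). A minimal sketch of the matching size handling on the reading side follows; the for_each_tile() helper is illustrative only and not part of this patch, while read_be32() simply mirrors write_be32().

#include <stddef.h>
#include <stdint.h>

/* Counterpart to write_be32(): a tile's size is stored as four big-endian
   bytes immediately before its boolcoded data. */
static uint32_t read_be32(const uint8_t *p) {
  return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
         ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

/* Walk n_tiles concatenated tiles in [data, end): all but the last carry a
   4-byte size prefix; the last tile runs to the end of the buffer. */
static void for_each_tile(const uint8_t *data, const uint8_t *end, int n_tiles,
                          void (*visit)(const uint8_t *tile, size_t size)) {
  int i;
  for (i = 0; i < n_tiles; ++i) {
    size_t size;
    if (i == n_tiles - 1) {
      size = (size_t)(end - data);
    } else {
      size = read_be32(data);
      data += 4;
    }
    visit(data, size);
    data += size;
  }
}
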
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c index a90bcf5..3112dad 100644 --- a/libvpx/vp9/encoder/vp9_dct.c +++ b/libvpx/vp9/encoder/vp9_dct.c @@ -587,7 +587,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, temp_in[j] = out[j + i * 8]; ht.rows(temp_in, temp_out); for (j = 0; j < 8; ++j) - output[j + i * 8] = temp_out[j] >> 1; + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; } } @@ -978,7 +978,8 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output, temp_in[j] = input[j * pitch + i] << 2; ht.cols(temp_in, temp_out); for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; +// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows @@ -1366,6 +1367,9 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { temp_in[j] = input[j * shortpitch + i] << 2; dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) + // TODO(cd): see quality impact of only doing + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; + // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 54b6e24..798adc1 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/encoder/vp9_encodeframe.h" @@ -44,11 +43,8 @@ int enc_debug = 0; #endif -void vp9_select_interp_filter_type(VP9_COMP *cpi); - -static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize); +static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, + int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -64,10 +60,8 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); * Eventually this should be replaced by custom no-reference routines, * which will be faster. */ -static const uint8_t VP9_VAR_OFFS[16] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; - +static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Original activity measure from Tim T's code. static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { @@ -92,13 +86,11 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { } // Stub for alternative experimental activity measures. 
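
Stepping back to the vp9_dct.c hunk above: the 8x8 hybrid transform now scales its row outputs with (v + (v < 0)) >> 1 instead of a bare shift, and the 16x16 column rounding swaps (v > 0) for (v < 0). Both changes make negative coefficients round the same way as positive ones instead of being pulled toward negative infinity by the arithmetic shift. A small self-checking sketch, assuming arithmetic right shift of negative values, which libvpx relies on throughout:

#include <assert.h>
#include <stdint.h>

static int16_t halve_old(int16_t v)   { return v >> 1; }               /* floor       */
static int16_t halve_new(int16_t v)   { return (v + (v < 0)) >> 1; }   /* toward zero */
static int16_t quarter_new(int16_t v) { return (v + 1 + (v < 0)) >> 2; }

int main(void) {
  assert(halve_old(3) == 1 && halve_old(-3) == -2);      /* old: asymmetric   */
  assert(halve_new(3) == 1 && halve_new(-3) == -1);      /* new: symmetric    */
  assert(quarter_new(6) == 1 && quarter_new(-6) == -1);  /* ties toward zero  */
  return 0;
}
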
-static unsigned int alt_activity_measure(VP9_COMP *cpi, - MACROBLOCK *x, int use_dc_pred) { +static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, + int use_dc_pred) { return vp9_encode_intra(cpi, x, use_dc_pred); } - -DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 }; - +DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function @@ -135,14 +127,12 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { unsigned int tmp; // Create a list to sort to - CHECK_MEM_ERROR(sortlist, - vpx_calloc(sizeof(unsigned int), - cpi->common.MBs)); + CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int), + cpi->common.MBs)); // Copy map to sort list vpx_memcpy(sortlist, cpi->mb_activity_map, - sizeof(unsigned int) * cpi->common.MBs); - + sizeof(unsigned int) * cpi->common.MBs); // Ripple each value down to its correct position for (i = 1; i < cpi->common.MBs; i ++) { @@ -153,13 +143,13 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { sortlist[j - 1] = sortlist[j]; sortlist[j] = tmp; } else - break; + break; } } // Even number MBs so estimate median as mean of two either side. median = (1 + sortlist[cpi->common.MBs >> 1] + - sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; + sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; cpi->activity_avg = median; @@ -167,7 +157,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { } #else // Simple mean for now - cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs); + cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); #endif if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN) @@ -211,9 +201,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { b = 4 * act + cpi->activity_avg; if (b >= a) - *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; + *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; else - *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); + *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); #if OUTPUT_NORM_ACT_STATS fprintf(f, " %6d", *(x->mb_activity_ptr)); @@ -238,9 +228,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { // Loop through all MBs. 
Note activity of each, average activity and // calculate a normalized activity for each static void build_activity_map(VP9_COMP *cpi) { - MACROBLOCK *const x = &cpi->mb; + MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; #if ALT_ACT_MEASURE YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; @@ -285,7 +275,6 @@ static void build_activity_map(VP9_COMP *cpi) { x->plane[0].src.buf += 16; } - // adjust to the next row of mbs x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; } @@ -315,7 +304,7 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { a = act + (2 * cpi->activity_avg); b = (2 * act) + cpi->activity_avg; - x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a); + x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a); x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); x->errorperbit += (x->errorperbit == 0); #endif @@ -324,41 +313,38 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { adjust_act_zbin(cpi, x); } -static void update_state(VP9_COMP *cpi, - PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE_TYPE bsize, - int output_enabled) { +static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE_TYPE bsize, int output_enabled) { int i, x_idx, y; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; -#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS - MB_PREDICTION_MODE mb_mode = mi->mbmi.mode; -#endif + MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; + int mb_mode_index = ctx->best_mode_index; const int mis = cpi->common.mode_info_stride; - const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize); + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; -#if CONFIG_DEBUG - assert(mb_mode < MB_MODE_COUNT); + assert(mi->mbmi.mode < MB_MODE_COUNT); assert(mb_mode_index < MAX_MODES); assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES); assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES); -#endif - assert(mi->mbmi.sb_type == bsize); + // Restore the coding context of the MB to that that was in place // when the mode was picked for it - for (y = 0; y < bh; y++) { - for (x_idx = 0; x_idx < bw; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx && - (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) { + for (y = 0; y < mi_height; y++) { + for (x_idx = 0; x_idx < mi_width; x_idx++) { + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) { MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis; *mi_addr = *mi; } } } + // FIXME(rbultje) I'm pretty sure this should go to the end of this block + // (i.e. 
after the output_enabled) if (bsize < BLOCK_SIZE_SB32X32) { if (bsize < BLOCK_SIZE_MB16X16) ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8]; @@ -367,15 +353,15 @@ static void update_state(VP9_COMP *cpi, if (mbmi->ref_frame[0] != INTRA_FRAME && mbmi->sb_type < BLOCK_SIZE_SB8X8) { *x->partition_info = ctx->partition_info; - mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int; - mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int; + mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } x->skip = ctx->skip; if (!output_enabled) return; - if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) { + if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < NB_TXFM_MODES; i++) { cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i]; } @@ -404,31 +390,13 @@ static void update_state(VP9_COMP *cpi, THR_TM /*TM_PRED*/, THR_B_PRED /*I4X4_PRED*/, }; - cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++; + cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++; #endif } else { - /* - // Reduce the activation RD thresholds for the best choice mode - if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) && - (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2))) - { - int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2); - - cpi->rd_thresh_mult[mb_mode_index] = - (cpi->rd_thresh_mult[mb_mode_index] - >= (MIN_THRESHMULT + best_adjustment)) ? - cpi->rd_thresh_mult[mb_mode_index] - best_adjustment : - MIN_THRESHMULT; - cpi->rd_threshes[mb_mode_index] = - (cpi->rd_baseline_thresh[mb_mode_index] >> 7) - * cpi->rd_thresh_mult[mb_mode_index]; - - } - */ // Note how often each mode chosen as best cpi->mode_chosen_counts[mb_mode_index]++; - if (mbmi->ref_frame[0] != INTRA_FRAME && - (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { + if (mbmi->ref_frame[0] != INTRA_FRAME + && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { int_mv best_mv, best_second_mv; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; @@ -445,72 +413,55 @@ static void update_state(VP9_COMP *cpi, if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) { int i, j; - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i && - (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j) + for (j = 0; j < mi_height; ++j) + for (i = 0; i < mi_width; ++i) + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j) xd->mode_info_context[mis * j + i].mbmi = *mbmi; } - if (cpi->common.mcomp_filter_type == SWITCHABLE && - is_inter_mode(mbmi->mode)) { - ++cpi->common.fc.switchable_interp_count - [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + if (cpi->common.mcomp_filter_type == SWITCHABLE + && is_inter_mode(mbmi->mode)) { + ++cpi->common.counts.switchable_interp[ + vp9_get_pred_context_switchable_interp(xd)] + [vp9_switchable_interp_map[mbmi->interp_filter]]; } cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; - cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; - cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - } -} - -static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize, - int start_y, int height, int start_x, int width) { - const int bw = 1 << mi_width_log2(bsize), bh = 1 << 
mi_height_log2(bsize); - const int end_x = MIN(start_x + bw, width); - const int end_y = MIN(start_y + bh, height); - int x, y; - unsigned seg_id = -1; + cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; + cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - buf += width * start_y; - assert(start_y < cm->mi_rows && start_x < cm->cur_tile_mi_col_end); - for (y = start_y; y < end_y; y++, buf += width) { - for (x = start_x; x < end_x; x++) { - seg_id = MIN(seg_id, buf[x]); + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; } } - - return seg_id; } -void vp9_setup_src_planes(MACROBLOCK *x, - const YV12_BUFFER_CONFIG *src, +void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mb_row, int mb_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src + ->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src + ->alpha_stride}; int i; for (i = 0; i < MAX_MB_PLANE; i++) { - setup_pred_plane(&x->plane[i].src, - buffers[i], strides[i], - mb_row, mb_col, NULL, - x->e_mbd.plane[i].subsampling_x, + setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col, + NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); } } -static void set_offsets(VP9_COMP *cpi, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; +static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; const int idx_str = xd->mode_info_stride * mi_row + mi_col; - const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mb_row = mi_row >> 1; const int mb_col = mi_col >> 1; const int idx_map = mb_row * cm->mb_cols + mb_col; @@ -518,10 +469,10 @@ static void set_offsets(VP9_COMP *cpi, // entropy context structures for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].above_context = cm->above_context[i] + - (mi_col * 2 >> xd->plane[i].subsampling_x); - xd->plane[i].left_context = cm->left_context[i] + - (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); + xd->plane[i].above_context = cm->above_context[i] + + (mi_col * 2 >> xd->plane[i].subsampling_x); + xd->plane[i].left_context = cm->left_context[i] + + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); } // partition contexts @@ -532,29 +483,28 @@ static void set_offsets(VP9_COMP *cpi, x->active_ptr = cpi->active_map + idx_map; /* pointers to mode info contexts */ - x->partition_info = x->pi + idx_str; - xd->mode_info_context = cm->mi + idx_str; + x->partition_info = x->pi + idx_str; + xd->mode_info_context = cm->mi + idx_str; mbmi = &xd->mode_info_context->mbmi; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? - cm->prev_mi + idx_str : NULL; + xd->prev_mode_info_context = cm->prev_mi ? 
cm->prev_mi + idx_str : NULL; // Set up destination pointers setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); /* Set up limit values for MV components to prevent them from * extending beyond the UMV borders assuming 16x16 block size */ - x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE + - (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND)); - x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE + - (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND)); + x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); + x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); + x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE + + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND)); + x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE + + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND)); // Set up distance of MB to edge of frame in 1/8th pel units - assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1))); - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width); /* set up source buffers */ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); @@ -564,31 +514,28 @@ static void set_offsets(VP9_COMP *cpi, x->rdmult = cpi->RDMULT; /* segment ID */ - if (xd->segmentation_enabled) { - uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map - : cm->last_frame_seg_map; - mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, - cm->mi_rows, mi_col, cm->mi_cols); + if (xd->seg.enabled) { + uint8_t *map = xd->seg.update_map ? 
cpi->segmentation_map + : cm->last_frame_seg_map; + mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); - assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1)); vp9_mb_init_quantizer(cpi, x); - if (xd->segmentation_enabled && cpi->seg0_cnt > 0 && - !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) && - vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { + if (xd->seg.enabled && cpi->seg0_cnt > 0 + && !vp9_segfeature_active(&xd->seg, 0, SEG_LVL_REF_FRAME) + && vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt; } else { const int y = mb_row & ~3; const int x = mb_col & ~3; - const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); + const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = - cm->cur_tile_mi_col_start * cm->mb_rows >> 1; - const int mb_cols = - (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1; + const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1; + const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) + >> 1; - cpi->seg0_progress = - ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; + cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) + << 16) / cm->MBs; } } else { mbmi->segment_id = 0; @@ -596,8 +543,9 @@ static void set_offsets(VP9_COMP *cpi, } static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, - TOKENEXTRA **tp, int *totalrate, int *totaldist, - BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { + int *totalrate, int64_t *totaldist, + BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -613,53 +561,48 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); - /* Find best coding mode & reconstruct the MB so it is available - * as a predictor for MBs that follow in the SB */ - if (cm->frame_type == KEY_FRAME) { - vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx); - } else { + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (cm->frame_type == KEY_FRAME) + vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx, + best_rd); + else vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist, - bsize, ctx); - } + bsize, ctx, best_rd); } static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MB_MODE_INFO * const mbmi = &mi->mbmi; if (cm->frame_type != KEY_FRAME) { - int segment_id, seg_ref_active; - - segment_id = mbmi->segment_id; - seg_ref_active = vp9_segfeature_active(xd, segment_id, - SEG_LVL_REF_FRAME); + const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id, + SEG_LVL_REF_FRAME); if (!seg_ref_active) - cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)] - [mbmi->ref_frame[0] > INTRA_FRAME]++; + cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)][mbmi + ->ref_frame[0] > INTRA_FRAME]++; // If the segment reference feature is enabled we have only a single // 
reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) { if (cm->comp_pred_mode == HYBRID_PREDICTION) - cpi->comp_inter_count[vp9_get_pred_context(cm, xd, - PRED_COMP_INTER_INTER)] - [mbmi->ref_frame[1] > INTRA_FRAME]++; + cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)] + [mbmi->ref_frame[1] > INTRA_FRAME]++; if (mbmi->ref_frame[1] > INTRA_FRAME) { - cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)] - [mbmi->ref_frame[0] == GOLDEN_FRAME]++; + cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)][mbmi + ->ref_frame[0] == GOLDEN_FRAME]++; } else { - cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)] - [0][mbmi->ref_frame[0] != LAST_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)] + [0][mbmi->ref_frame[0] != LAST_FRAME]++; if (mbmi->ref_frame[0] != LAST_FRAME) - cpi->single_ref_count[vp9_get_pred_context(cm, xd, - PRED_SINGLE_REF_P2)] - [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1] + [mbmi->ref_frame[0] != GOLDEN_FRAME]++; } } // Count of last ref frame 0,0 usage @@ -673,7 +616,7 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { // partition down to 4x4 block size is enabled. static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD * const xd = &x->e_mbd; switch (bsize) { case BLOCK_SIZE_SB64X64: @@ -704,7 +647,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL; + return NULL ; } } @@ -722,75 +665,80 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL; + return NULL ; } } static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], - PARTITION_CONTEXT sa[8], - PARTITION_CONTEXT sl[8], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int p; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int mwl = mi_width_log2(bsize), mw = 1 << mwl; - int mhl = mi_height_log2(bsize), mh = 1 << mhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->above_context[p] + - ((mi_col * 2) >> xd->plane[p].subsampling_x), - a + bw * p, - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); - vpx_memcpy(cm->left_context[p] + - ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), - l + bh * p, - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + vpx_memcpy( + cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + vpx_memcpy( + cm->left_context[p] + + ((mi_row & MI_MASK) 
* 2 >> xd->plane[p].subsampling_y), + l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); } vpx_memcpy(cm->above_seg_context + mi_col, sa, - sizeof(PARTITION_CONTEXT) * mw); + sizeof(PARTITION_CONTEXT) * mi_width); vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mh); + sizeof(PARTITION_CONTEXT) * mi_height); } static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, - ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], - PARTITION_CONTEXT sa[8], - PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int p; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int mwl = mi_width_log2(bsize), mw = 1 << mwl; - int mhl = mi_height_log2(bsize), mh = 1 << mhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; // buffer the above/left context information of the block in search. for (p = 0; p < MAX_MB_PLANE; ++p) { - vpx_memcpy(a + bw * p, cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); - vpx_memcpy(l + bh * p, cm->left_context[p] + - ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + vpx_memcpy( + a + num_4x4_blocks_wide * p, + cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + vpx_memcpy( + l + num_4x4_blocks_high * p, + cm->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); } vpx_memcpy(sa, cm->above_seg_context + mi_col, - sizeof(PARTITION_CONTEXT) * mw); + sizeof(PARTITION_CONTEXT) * mi_width); vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mh); + sizeof(PARTITION_CONTEXT) * mi_height); } -static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE bsize, int sub_index) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -813,16 +761,17 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, } } -static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE_TYPE 
bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; - int bwl, bhl; int UNINITIALIZED_IS_SAFE(pl); + PARTITION_TYPE partition; + BLOCK_SIZE_TYPE subsize; + int i; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -833,44 +782,46 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, pl = partition_plane_context(xd, bsize); c1 = *(get_sb_partitioning(x, bsize)); } + partition = partition_lookup[bsl][c1]; - bwl = b_width_log2(c1), bhl = b_height_log2(c1); - - if (bsl == bwl && bsl == bhl) { - if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) + switch (partition) { + case PARTITION_NONE: + if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); - } else if (bsl == bhl && bsl > bwl) { - if (output_enabled) - cpi->partition_count[pl][PARTITION_VERT]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); - } else if (bsl == bwl && bsl > bhl) { - if (output_enabled) - cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); - } else { - BLOCK_SIZE_TYPE subsize; - int i; - - assert(bwl < bsl && bhl < bsl); - subsize = get_subsize(bsize, PARTITION_SPLIT); + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); + break; + case PARTITION_VERT: + if (output_enabled) + cpi->partition_count[pl][PARTITION_VERT]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + break; + case PARTITION_HORZ: + if (output_enabled) + cpi->partition_count[pl][PARTITION_HORZ]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + break; + case PARTITION_SPLIT: + subsize = get_subsize(bsize, PARTITION_SPLIT); - if (output_enabled) - cpi->partition_count[pl][PARTITION_SPLIT]++; + if (output_enabled) + cpi->partition_count[pl][PARTITION_SPLIT]++; - for (i = 0; i < 4; i++) { - const int x_idx = i & 1, y_idx = i >> 1; + for (i = 0; i < 4; i++) { + const int x_idx = i & 1, y_idx = i >> 1; - *(get_sb_index(xd, subsize)) = i; - encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, - output_enabled, subsize); - } + *(get_sb_index(xd, subsize)) = i; + encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, + output_enabled, subsize); + } + break; + default: + assert(0); + break; } - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) { + if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } @@ -880,26 +831,28 @@ static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE_TYPE bsize) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bsl = b_width_log2(bsize); - int bs = (1 << bsl) / 2; // int block_row, block_col; - int row, col; - - // this test function sets the entire macroblock to the same bsize - for (block_row = 0; block_row < 8; block_row += bs) { - for (block_col = 0; block_col < 8; block_col += bs) { - for (row = 0; row < bs; row++) { - for (col = 0; col < bs; col++) { - m[(block_row+row)*mis + block_col+col].mbmi.sb_type 
= bsize; - } - } + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = bsize; + } + } +} +static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) { + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mode_info_stride; + int block_row, block_col; + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = + p[block_row * mis + block_col].mbmi.sb_type; } } } -static void set_block_size(VP9_COMMON *const cm, - MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis, - int mi_row, int mi_col) { +static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m, + BLOCK_SIZE_TYPE bsize, int mis, int mi_row, + int mi_col) { int row, col; int bwl = b_width_log2(bsize); int bhl = b_height_log2(bsize); @@ -911,10 +864,11 @@ static void set_block_size(VP9_COMMON *const cm, for (col = 0; col < bs; col++) { if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols) continue; - m2[row*mis+col].mbmi.sb_type = bsize; + m2[row * mis + col].mbmi.sb_type = bsize; } } } + typedef struct { int64_t sum_square_error; int64_t sum_error; @@ -922,11 +876,15 @@ typedef struct { int variance; } var; +typedef struct { + var none; + var horz[2]; + var vert[2]; +} partition_variance; + #define VT(TYPE, BLOCKSIZE) \ typedef struct { \ - var none; \ - var horz[2]; \ - var vert[2]; \ + partition_variance vt; \ BLOCKSIZE split[4]; } TYPE; VT(v8x8, var) @@ -934,20 +892,67 @@ VT(v16x16, v8x8) VT(v32x32, v16x16) VT(v64x64, v32x32) +typedef struct { + partition_variance *vt; + var *split[4]; +} vt_node; + typedef enum { V16X16, V32X32, V64X64, } TREE_LEVEL; +static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) { + int i; + switch (block_size) { + case BLOCK_SIZE_SB64X64: { + v64x64 *vt = (v64x64 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_SB32X32: { + v32x32 *vt = (v32x32 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_MB16X16: { + v16x16 *vt = (v16x16 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_SB8X8: { + v8x8 *vt = (v8x8 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i]; + break; + } + default: + node->vt = 0; + for (i = 0; i < 4; i++) + node->split[i] = 0; + assert(-1); + } +} + // Set variance values given sum square error, sum error, count. static void fill_variance(var *v, int64_t s2, int64_t s, int c) { v->sum_square_error = s2; v->sum_error = s; v->count = c; - v->variance = 256 - * (v->sum_square_error - v->sum_error * v->sum_error / v->count) - / v->count; + if (c > 0) + v->variance = 256 + * (v->sum_square_error - v->sum_error * v->sum_error / v->count) + / v->count; + else + v->variance = 0; } // Combine 2 variance structures by summing the sum_error, sum_square_error, @@ -956,31 +961,95 @@ void sum_2_variances(var *r, var *a, var*b) { fill_variance(r, a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->count + b->count); } -// Fill one level of our variance tree, by summing the split sums into each of -// the horizontal, vertical and none from split and recalculating variance. 
-#define fill_variance_tree(VT) \ - sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \ - sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \ - sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \ - sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \ - sum_2_variances(VT.none, VT.vert[0], VT.vert[1]); - -// Set the blocksize in the macroblock info structure if the variance is less -// than our threshold to one of none, horz, vert. -#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \ - if (VT.none.variance < threshold) { \ - set_block_size(cm, m, BLOCKSIZE, mis, R, C); \ - ACTION; \ - } \ - if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \ - set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \ - ACTION; \ - } \ - if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \ - set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \ - ACTION; \ + +static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { + vt_node node; + tree_to_node(data, block_size, &node); + sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]); + sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]); + sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]); + sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]); + sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]); +} + +#if PERFORM_RANDOM_PARTITIONING +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, + BLOCK_SIZE_TYPE block_size, int mi_row, + int mi_col, int mi_size) { + VP9_COMMON * const cm = &cpi->common; + vt_node vt; + const int mis = cm->mode_info_stride; + int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex; + + tree_to_node(data, block_size, &vt); + + // split none is available only if we have more than half a block size + // in width and height inside the visible image + if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows && + (rand() & 3) < 1) { + set_block_size(cm, m, block_size, mis, mi_row, mi_col); + return 1; + } + + // vertical split is available on all but the bottom border + if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold + && (rand() & 3) < 1) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + mi_col); + return 1; } + // horizontal split is available on all but the right border + if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold + && (rand() & 3) < 1) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, + mi_col); + return 1; + } + + return 0; +} + +#else + +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, + BLOCK_SIZE_TYPE block_size, int mi_row, + int mi_col, int mi_size) { + VP9_COMMON * const cm = &cpi->common; + vt_node vt; + const int mis = cm->mode_info_stride; + int64_t threshold = 50 * cpi->common.base_qindex; + + tree_to_node(data, block_size, &vt); + + // split none is available only if we have more than half a block size + // in width and height inside the visible image + if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows + && vt.vt->none.variance < threshold) { + set_block_size(cm, m, block_size, mis, mi_row, mi_col); + return 1; + } + + // vertical split is available on all but the bottom border + if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold + && vt.vt->vert[1].variance < threshold) { 
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + mi_col); + return 1; + } + + // horizontal split is available on all but the right border + if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold + && vt.vt->horz[1].variance < threshold) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, + mi_col); + return 1; + } + + return 0; +} +#endif + static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; @@ -993,8 +1062,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, v64x64 vt; unsigned char * s; int sp; - const unsigned char * d = xd->plane[0].pre->buf; - int dp = xd->plane[0].pre->stride; + const unsigned char * d; + int dp; int pixels_wide = 64, pixels_high = 64; vpx_memset(&vt, 0, sizeof(vt)); @@ -1014,179 +1083,228 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, // but this needs more experimentation. threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex; - // if ( cm->frame_type == KEY_FRAME ) { d = vp9_64x64_zeros; dp = 64; - // } + if (cm->frame_type != KEY_FRAME) { + int_mv nearest_mv, near_mv; + YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0]; + YV12_BUFFER_CONFIG *second_ref_fb = NULL; + + setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, + &xd->scale_factor[0]); + setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, + &xd->scale_factor[1]); + xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; + xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; + vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]], + &nearest_mv, &near_mv); + + xd->mode_info_context->mbmi.mv[0] = nearest_mv; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64); + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + + } // Fill in the entire tree of 8x8 variances for splits. 
for (i = 0; i < 4; i++) { const int x32_idx = ((i & 1) << 5); const int y32_idx = ((i >> 1) << 5); for (j = 0; j < 4; j++) { - const int x_idx = x32_idx + ((j & 1) << 4); - const int y_idx = y32_idx + ((j >> 1) << 4); - const uint8_t *st = s + y_idx * sp + x_idx; - const uint8_t *dt = d + y_idx * dp + x_idx; - unsigned int sse = 0; - int sum = 0; + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); v16x16 *vst = &vt.split[i].split[j]; - sse = sum = 0; - if (x_idx < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum); - fill_variance(&vst->split[0].none, sse, sum, 64); - sse = sum = 0; - if (x_idx + 8 < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum); - fill_variance(&vst->split[1].none, sse, sum, 64); - sse = sum = 0; - if (x_idx < pixels_wide && y_idx + 8 < pixels_high) - vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum); - fill_variance(&vst->split[2].none, sse, sum, 64); - sse = sum = 0; - if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high) - vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse, - &sum); - fill_variance(&vst->split[3].none, sse, sum, 64); + for (k = 0; k < 4; k++) { + int x_idx = x16_idx + ((k & 1) << 3); + int y_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x_idx < pixels_wide && y_idx < pixels_high) + vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); + fill_variance(&vst->split[k].vt.none, sse, sum, 64); + } } } // Fill the rest of the variance tree by summing the split partition // values. for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - fill_variance_tree(&vt.split[i].split[j]) + fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16); } - fill_variance_tree(&vt.split[i]) + fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32); } - fill_variance_tree(&vt) - - // Now go through the entire structure, splitting every blocksize until + fill_variance_tree(&vt, BLOCK_SIZE_SB64X64); + // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. 
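
The variance tree built above keeps raw sums rather than finished variances at every node, which is what lets fill_variance_tree() merge four children into horizontal, vertical and whole-block statistics without losing precision. A condensed sketch of that bookkeeping; the field names mirror the patch, while fill() and merge() stand in for fill_variance() and sum_2_variances():

#include <stdint.h>

typedef struct {
  int64_t sum_square_error;   /* SSE of (source - predictor) over the block */
  int64_t sum_error;          /* signed sum of the same differences         */
  int     count;              /* number of pixels contributing              */
  int     variance;           /* 256 * population variance                  */
} var_sketch;

static void fill(var_sketch *v, int64_t s2, int64_t s, int c) {
  v->sum_square_error = s2;
  v->sum_error = s;
  v->count = c;
  /* 256 * (SSE - SE*SE/N) / N, guarded against empty blocks as in the patch */
  v->variance = c > 0 ? (int)(256 * (s2 - s * s / c) / c) : 0;
}

static void merge(var_sketch *r, const var_sketch *a, const var_sketch *b) {
  /* Adding raw sums is exact; averaging two derived variances would not be. */
  fill(r, a->sum_square_error + b->sum_square_error,
          a->sum_error + b->sum_error,
          a->count + b->count);
}

set_vt_partitioning() then walks this tree top-down and accepts the largest block whose merged variance falls below a threshold derived from base_qindex, only descending to 8x8 when every larger candidate is too busy.
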
- set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return); - for (i = 0; i < 4; ++i) { - const int x32_idx = ((i & 1) << 2); - const int y32_idx = ((i >> 1) << 2); - set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx, - continue); - - for (j = 0; j < 4; ++j) { - const int x16_idx = ((j & 1) << 1); - const int y16_idx = ((j >> 1) << 1); - set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx, - mi_col+x32_idx+x16_idx, continue); - - for (k = 0; k < 4; ++k) { - const int x8_idx = (k & 1); - const int y8_idx = (k >> 1); - set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, - mi_row + y32_idx + y16_idx + y8_idx, - mi_col + x32_idx + x16_idx + x8_idx); + if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col, + 4)) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 2); + const int y32_idx = ((i >> 1) << 2); + if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32, + (mi_row + y32_idx), (mi_col + x32_idx), 2)) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 1); + const int y16_idx = ((j >> 1) << 1); + if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m, + BLOCK_SIZE_MB16X16, + (mi_row + y32_idx + y16_idx), + (mi_col + x32_idx + x16_idx), 1)) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1); + const int y8_idx = (k >> 1); + set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, + (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx)); + } + } + } } } } } static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { + int *rate, int64_t *dist, int do_recon) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl = b_width_log2(m->mbmi.sb_type); - int bhl = b_height_log2(m->mbmi.sb_type); int bsl = b_width_log2(bsize); - int bh = (1 << bhl); - int bs = (1 << bsl); - int bss = (1 << bsl)/4; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int ms = num_4x4_blocks_wide / 2; + int mh = num_4x4_blocks_high / 2; + int bss = (1 << bsl) / 4; int i, pl; - PARTITION_TYPE partition; + PARTITION_TYPE partition = PARTITION_NONE; BLOCK_SIZE_TYPE subsize; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; - int r = 0, d = 0; + int last_part_rate = INT_MAX; + int64_t last_part_dist = INT_MAX; + int split_rate = INT_MAX; + int64_t split_dist = INT_MAX; + int none_rate = INT_MAX; + int64_t none_dist = INT_MAX; + int chosen_rate = INT_MAX; + int64_t chosen_dist = INT_MAX; + BLOCK_SIZE_TYPE sub_subsize = BLOCK_SIZE_AB4X4; + int splits_below = 0; + BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - - // parse the partition type - if ((bwl == bsl) && (bhl == bsl)) - partition = PARTITION_NONE; - else if ((bwl == bsl) && (bhl < bsl)) - partition = PARTITION_HORZ; - else if ((bwl < bsl) && (bhl == bsl)) - partition = PARTITION_VERT; - else if ((bwl < bsl) && (bhl < bsl)) - partition = PARTITION_SPLIT; - else - assert(0); + partition = partition_lookup[bsl][bs_type]; subsize = get_subsize(bsize, partition); - // TODO(JBB): this restriction is here because pick_sb_modes can return - // r's that are INT_MAX meaning we can't select a mode / mv for this block. 
- // when the code is made to work for less than sb8x8 we need to come up with - // a solution to this problem. - assert(subsize >= BLOCK_SIZE_SB8X8); - - if (bsize >= BLOCK_SIZE_SB8X8) { - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = cm->above_seg_context + mi_col; + if (bsize < BLOCK_SIZE_SB8X8) { + if (xd->ab_index != 0) { + *rate = 0; + *dist = 0; + return; + } + } else { *(get_sb_partitioning(x, bsize)) = subsize; } - - pl = partition_plane_context(xd, bsize); save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + + if (cpi->sf.adjust_partitioning_from_last_frame) { + // Check if any of the sub blocks are further split. + if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) { + sub_subsize = get_subsize(subsize, PARTITION_SPLIT); + splits_below = 1; + for (i = 0; i < 4; i++) { + int jj = i >> 1, ii = i & 0x01; + if (m[jj * bss * mis + ii * bss].mbmi.sb_type >= sub_subsize) { + splits_below = 0; + } + } + } + + // If partition is not none try none unless each of the 4 splits are split + // even further.. + if (partition != PARTITION_NONE && !splits_below && + mi_row + (ms >> 1) < cm->mi_rows && + mi_col + (ms >> 1) < cm->mi_cols) { + *(get_sb_partitioning(x, bsize)) = bsize; + pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize, + get_block_context(x, bsize), INT64_MAX); + + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + none_rate += x->partition_cost[pl][PARTITION_NONE]; + + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + m->mbmi.sb_type = bs_type; + *(get_sb_partitioning(x, bsize)) = subsize; + } + } + switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, - get_block_context(x, bsize)); - r += x->partition_cost[pl][PARTITION_NONE]; + pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, - get_block_context(x, subsize)); - if (mi_row + (bh >> 1) <= cm->mi_rows) { - int rt, dt; + pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + subsize, get_block_context(x, subsize), INT64_MAX); + if (last_part_rate != INT_MAX && + bsize >= BLOCK_SIZE_SB8X8 && mi_row + (mh >> 1) < cm->mi_rows) { + int rt = 0; + int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (bs >> 2), mi_col, tp, &rt, &dt, subsize, - get_block_context(x, subsize)); - r += rt; - d += dt; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, + get_block_context(x, subsize), INT64_MAX); + if (rt == INT_MAX || dt == INT_MAX) { + last_part_rate = INT_MAX; + last_part_dist = INT_MAX; + break; + } + + last_part_rate += rt; + last_part_dist += dt; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_HORZ]; break; case PARTITION_VERT: *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, - get_block_context(x, subsize)); - if (mi_col + (bs >> 1) <= cm->mi_cols) { - int rt, dt; + pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist, + subsize, get_block_context(x, subsize), INT64_MAX); + if (last_part_rate != INT_MAX && + bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) 
< cm->mi_cols) { + int rt = 0; + int64_t dt = 0; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (bs >> 2), tp, &rt, &dt, subsize, - get_block_context(x, subsize)); - r += rt; - d += dt; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, + get_block_context(x, subsize), INT64_MAX); + if (rt == INT_MAX || dt == INT_MAX) { + last_part_rate = INT_MAX; + last_part_dist = INT_MAX; + break; + } + last_part_rate += rt; + last_part_dist += dt; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_VERT]; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); break; case PARTITION_SPLIT: + // Split partition. + last_part_rate = 0; + last_part_dist = 0; for (i = 0; i < 4; i++) { - int x_idx = (i & 1) * (bs >> 2); - int y_idx = (i >> 1) * (bs >> 2); + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); int jj = i >> 1, ii = i & 0x01; - int rt, dt; + int rt; + int64_t dt; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -1194,56 +1312,137 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, *(get_sb_index(xd, subsize)) = i; rd_use_partition(cpi, m + jj * bss * mis + ii * bss, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &rt, &dt); - r += rt; - d += dt; + mi_col + x_idx, subsize, &rt, &dt, i != 3); + if (rt == INT_MAX || dt == INT_MAX) { + last_part_rate = INT_MAX; + last_part_dist = INT_MAX; + break; + } + last_part_rate += rt; + last_part_dist += dt; } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_SPLIT]; break; default: assert(0); } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (last_part_rate < INT_MAX) + last_part_rate += x->partition_cost[pl][partition]; + + if (cpi->sf.adjust_partitioning_from_last_frame + && partition != PARTITION_SPLIT && bsize > BLOCK_SIZE_SB8X8 + && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) + && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) { + BLOCK_SIZE_TYPE split_subsize = get_subsize(bsize, PARTITION_SPLIT); + split_rate = 0; + split_dist = 0; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - // update partition context -#if CONFIG_AB4X4 - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { -#else - if (bsize > BLOCK_SIZE_SB8X8 - && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) { -#endif + // Split partition. 
+ for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2); + int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2); + int rt = 0; + int64_t dt = 0; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + + if ((mi_row + y_idx >= cm->mi_rows) + || (mi_col + x_idx >= cm->mi_cols)) + continue; + + *(get_sb_index(xd, split_subsize)) = i; + *(get_sb_partitioning(x, bsize)) = split_subsize; + *(get_sb_partitioning(x, split_subsize)) = split_subsize; + + save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + + pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + split_subsize, get_block_context(x, split_subsize), + INT64_MAX); + + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + + if (rt == INT_MAX || dt == INT_MAX) { + split_rate = INT_MAX; + split_dist = INT_MAX; + break; + } + + if (i != 3) + encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, + split_subsize); + + split_rate += rt; + split_dist += dt; + set_partition_seg_context(cm, xd, mi_row + y_idx, mi_col + x_idx); + pl = partition_plane_context(xd, bsize); + split_rate += x->partition_cost[pl][PARTITION_NONE]; + } set_partition_seg_context(cm, xd, mi_row, mi_col); - update_partition_context(xd, subsize, bsize); + pl = partition_plane_context(xd, bsize); + if (split_rate < INT_MAX) { + split_rate += x->partition_cost[pl][PARTITION_SPLIT]; + + chosen_rate = split_rate; + chosen_dist = split_dist; + } } + + // If last_part is better set the partitioning to that... + if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) + < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { + m->mbmi.sb_type = bsize; + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = subsize; + chosen_rate = last_part_rate; + chosen_dist = last_part_dist; + } + // If none was better set the partitioning to that... + if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) + > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = bsize; + chosen_rate = none_rate; + chosen_dist = none_dist; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (r < INT_MAX && d < INT_MAX) + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. + if ( bsize == BLOCK_SIZE_SB64X64) + assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); + + if (do_recon) encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); - *rate = r; - *dist = d; + + *rate = chosen_rate; + *dist = chosen_dist; } // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previously rate-distortion optimization // results, for encoding speed-up. 
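
To summarise the selection at the end of rd_use_partition() above: the partitioning inherited from the previous frame, an optional PARTITION_NONE retry, and an optional forced four-way split are each given a rate and a distortion, INT_MAX marks a candidate that could not be coded, and the cheapest rate-distortion cost wins. A condensed sketch of that choice; rd_cost() is a simplified stand-in for the fixed-point RDCOST macro:

#include <limits.h>
#include <stdint.h>

typedef struct { int rate; int64_t dist; } rd_stats;

static int64_t rd_cost(int rdmult, const rd_stats *s) {
  if (s->rate == INT_MAX)                        /* candidate not codeable */
    return INT64_MAX;
  return (int64_t)s->rate * rdmult + s->dist;    /* simplified Lagrangian  */
}

static rd_stats choose_partition(int rdmult, rd_stats split,
                                 rd_stats last_part, rd_stats none) {
  rd_stats chosen = split;                       /* forced split candidate */
  if (rd_cost(rdmult, &last_part) < rd_cost(rdmult, &chosen))
    chosen = last_part;               /* keep last frame's partitioning */
  if (rd_cost(rdmult, &none) < rd_cost(rdmult, &chosen))
    chosen = none;                    /* or collapse to PARTITION_NONE  */
  return chosen;
}

For a 64x64 superblock at least one candidate must be codeable, which is what the assert on chosen_rate and chosen_dist above enforces before the block is re-encoded when do_recon is set.
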
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, + int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, + int64_t *dist, int do_recon, int64_t best_rd) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int bsl = b_width_log2(bsize), bs = 1 << bsl; int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; int i, pl; BLOCK_SIZE_TYPE subsize; - int srate = INT_MAX, sdist = INT_MAX; + int srate = INT_MAX; + int64_t sdist = INT_MAX; + + (void) *tp_orig; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) { @@ -1256,127 +1455,343 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); // PARTITION_SPLIT - if (bsize >= BLOCK_SIZE_SB8X8) { - int r4 = 0, d4 = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - *(get_sb_partitioning(x, bsize)) = subsize; + if (!cpi->sf.use_partitions_greater_than + || (cpi->sf.use_partitions_greater_than + && bsize > cpi->sf.greater_than_block_size)) { + if (bsize > BLOCK_SIZE_SB8X8) { + int r4 = 0; + int64_t d4 = 0, sum_rd = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r = 0; + int64_t d = 0; - for (i = 0; i < 4; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0, d = 0; - - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &r, &d); + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, + &d, i != 3, best_rd - sum_rd); - r4 += r; - d4 += d; + if (r == INT_MAX) { + r4 = INT_MAX; + sum_rd = INT64_MAX; + } else { + r4 += r; + d4 += d; + sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); + } + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r4 != INT_MAX && i == 4) { + r4 += x->partition_cost[pl][PARTITION_SPLIT]; + *(get_sb_partitioning(x, bsize)) = subsize; + assert(r4 >= 0); + assert(d4 >= 0); + srate = r4; + sdist = d4; + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r4 < INT_MAX) - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - assert(r4 >= 0); - assert(d4 >= 0); - srate = r4; - sdist = d4; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - // PARTITION_HORZ - if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { - int r2, d2; - int r = 0, d = 0; - subsize = get_subsize(bsize, PARTITION_HORZ); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, - get_block_context(x, subsize)); - - if (mi_row + (ms >> 1) < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), subsize, 
0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize, - get_block_context(x, subsize)); - r2 += r; - d2 += d; - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; + x->fast_ms = 0; + x->pred_mv.as_int = 0; + x->subblock_ref = 0; + + // Use 4 subblocks' motion estimation results to speed up current + // partition's checking. + if (cpi->sf.using_small_partition_info) { + // Only use 8x8 result for non HD videos. + // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; + int use_8x8 = 1; + + if (cm->frame_type && !cpi->is_src_frame_alt_ref && + ((use_8x8 && bsize == BLOCK_SIZE_MB16X16) || + bsize == BLOCK_SIZE_SB32X32 || bsize == BLOCK_SIZE_SB64X64)) { + int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; + + if (bsize == BLOCK_SIZE_MB16X16) { + ref0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi. + ref_frame[0]; + ref1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi. + ref_frame[0]; + ref2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi. + ref_frame[0]; + ref3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi. + ref_frame[0]; + } else if (bsize == BLOCK_SIZE_SB32X32) { + ref0 = x->mb_context[xd->sb_index][0].mic.mbmi.ref_frame[0]; + ref1 = x->mb_context[xd->sb_index][1].mic.mbmi.ref_frame[0]; + ref2 = x->mb_context[xd->sb_index][2].mic.mbmi.ref_frame[0]; + ref3 = x->mb_context[xd->sb_index][3].mic.mbmi.ref_frame[0]; + } else if (bsize == BLOCK_SIZE_SB64X64) { + ref0 = x->sb32_context[0].mic.mbmi.ref_frame[0]; + ref1 = x->sb32_context[1].mic.mbmi.ref_frame[0]; + ref2 = x->sb32_context[2].mic.mbmi.ref_frame[0]; + ref3 = x->sb32_context[3].mic.mbmi.ref_frame[0]; + } + + // Currently, only consider 4 inter ref frames. + if (ref0 && ref1 && ref2 && ref3) { + int16_t mvr0 = 0, mvc0 = 0, mvr1 = 0, mvc1 = 0, mvr2 = 0, mvc2 = 0, + mvr3 = 0, mvc3 = 0; + int d01, d23, d02, d13; // motion vector distance between 2 blocks + + // Get each subblock's motion vectors. + if (bsize == BLOCK_SIZE_MB16X16) { + mvr0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0]. + as_mv.row; + mvc0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0]. + as_mv.col; + mvr1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0]. + as_mv.row; + mvc1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0]. + as_mv.col; + mvr2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0]. + as_mv.row; + mvc2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0]. + as_mv.col; + mvr3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0]. + as_mv.row; + mvc3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0]. 
+ as_mv.col; + } else if (bsize == BLOCK_SIZE_SB32X32) { + mvr0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.row; + mvc0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.col; + mvr1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.row; + mvc1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.col; + mvr2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.row; + mvc2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.col; + mvr3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.row; + mvc3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.col; + } else if (bsize == BLOCK_SIZE_SB64X64) { + mvr0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.row; + mvc0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.col; + mvr1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.row; + mvc1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.col; + mvr2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.row; + mvc2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.col; + mvr3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.row; + mvc3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.col; + } + + // Adjust sign if ref is alt_ref + if (cm->ref_frame_sign_bias[ref0]) { + mvr0 *= -1; + mvc0 *= -1; + } + + if (cm->ref_frame_sign_bias[ref1]) { + mvr1 *= -1; + mvc1 *= -1; + } + + if (cm->ref_frame_sign_bias[ref2]) { + mvr2 *= -1; + mvc2 *= -1; + } + + if (cm->ref_frame_sign_bias[ref3]) { + mvr3 *= -1; + mvc3 *= -1; + } + + // Calculate mv distances. + d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); + d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); + d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); + d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); + + if (d01 < 24 && d23 < 24 && d02 < 24 && d13 < 24) { + // Set fast motion search level. + x->fast_ms = 1; + + // Calculate prediction MV + x->pred_mv.as_mv.row = (mvr0 + mvr1 + mvr2 + mvr3) >> 2; + x->pred_mv.as_mv.col = (mvc0 + mvc1 + mvc2 + mvc3) >> 2; + + if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && + d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { + // Set fast motion search level. 
+ x->fast_ms = 2; + + if (!d01 && !d23 && !d02 && !d13) { + x->fast_ms = 3; + x->subblock_ref = ref0; + } + } + } + } } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - // PARTITION_VERT - if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { - int r2, d2; - subsize = get_subsize(bsize, PARTITION_VERT); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, - get_block_context(x, subsize)); - if (mi_col + (ms >> 1) < cm->mi_cols) { - int r = 0, d = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize, - get_block_context(x, subsize)); - r2 += r; - d2 += d; - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; + if (!cpi->sf.use_partitions_less_than + || (cpi->sf.use_partitions_less_than + && bsize <= cpi->sf.less_than_block_size)) { + int larger_is_better = 0; + // PARTITION_NONE + if ((mi_row + (ms >> 1) < cm->mi_rows) && + (mi_col + (ms >> 1) < cm->mi_cols)) { + int r; + int64_t d; + pick_sb_modes(cpi, mi_row, mi_col, &r, &d, bsize, + get_block_context(x, bsize), best_rd); + if (r != INT_MAX && bsize >= BLOCK_SIZE_SB8X8) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r += x->partition_cost[pl][PARTITION_NONE]; + } + + if (r != INT_MAX && + (bsize == BLOCK_SIZE_SB8X8 || + RDCOST(x->rdmult, x->rddiv, r, d) < + RDCOST(x->rdmult, x->rddiv, srate, sdist))) { + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r, d)); + srate = r; + sdist = d; + larger_is_better = 1; + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = bsize; + } } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - } - // PARTITION_NONE - if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - int r, d; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, - get_block_context(x, bsize)); - if (bsize >= BLOCK_SIZE_SB8X8) { + if (bsize == BLOCK_SIZE_SB8X8) { + int r4 = 0; + int64_t d4 = 0, sum_rd = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r = 0; + int64_t d = 0; + + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, + &d, i != 3, best_rd - sum_rd); + + if (r == INT_MAX) { + r4 = INT_MAX; + sum_rd = INT64_MAX; + } else { + r4 += r; + d4 += d; + sum_rd = RDCOST(x->rdmult, x->rddiv, r4, d4); + } + } set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_NONE]; + if (r4 != INT_MAX && i == 4) { + r4 += x->partition_cost[pl][PARTITION_SPLIT]; + if (RDCOST(x->rdmult, x->rddiv, r4, d4) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r4; + sdist = d4; + larger_is_better = 0; + *(get_sb_partitioning(x, bsize)) = subsize; + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r4, d4)); + } + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, 
bsize); } - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r; - sdist = d; - if (bsize >= BLOCK_SIZE_SB8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + if (!cpi->sf.use_square_partition_only && + (!cpi->sf.less_rectangular_check ||!larger_is_better)) { + // PARTITION_HORZ + if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { + int r2, r = 0; + int64_t d2, d = 0, h_rd; + subsize = get_subsize(bsize, PARTITION_HORZ); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, + get_block_context(x, subsize), best_rd); + h_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); + + if (r2 != INT_MAX && h_rd < best_rd && + mi_row + (ms >> 1) < cm->mi_rows) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &r, &d, subsize, + get_block_context(x, subsize), best_rd - h_rd); + if (r == INT_MAX) { + r2 = INT_MAX; + } else { + r2 += r; + d2 += d; + } + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r2 < INT_MAX) + r2 += x->partition_cost[pl][PARTITION_HORZ]; + if (r2 != INT_MAX && RDCOST(x->rdmult, x->rddiv, r2, d2) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + best_rd = MIN(best_rd, RDCOST(x->rdmult, x->rddiv, r2, d2)); + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } + + // PARTITION_VERT + if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { + int r2; + int64_t d2, v_rd; + subsize = get_subsize(bsize, PARTITION_VERT); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, &r2, &d2, subsize, + get_block_context(x, subsize), best_rd); + v_rd = RDCOST(x->rdmult, x->rddiv, r2, d2); + if (r2 != INT_MAX && v_rd < best_rd && + mi_col + (ms >> 1) < cm->mi_cols) { + int r = 0; + int64_t d = 0; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &r, &d, subsize, + get_block_context(x, subsize), best_rd - v_rd); + if (r == INT_MAX) { + r2 = INT_MAX; + } else { + r2 += r; + d2 += d; + } + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r2 < INT_MAX) + r2 += x->partition_cost[pl][PARTITION_VERT]; + if (r2 != INT_MAX && + RDCOST(x->rdmult, x->rddiv, r2, d2) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } } } - *rate = srate; *dist = sdist; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (srate < INT_MAX && sdist < INT_MAX) + if (srate < INT_MAX && sdist < INT_MAX && do_recon) encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); if (bsize == BLOCK_SIZE_SB64X64) { @@ -1388,9 +1803,61 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, } } -static void encode_sb_row(VP9_COMP *cpi, int mi_row, - TOKENEXTRA **tp, int *totalrate) { - VP9_COMMON *const cm = &cpi->common; +// Examines 64x64 block and chooses a best reference frame +static void rd_pick_reference_frame(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, + int mi_col, int *rate, int64_t *dist) { + VP9_COMMON * const cm = &cpi->common; 
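/*
 * [Editor's illustrative sketch -- not part of the commit above.]
 * rd_pick_partition() compares NONE/SPLIT/HORZ/VERT purely by rate-distortion
 * cost and threads the running best cost ("best_rd") through the recursion so
 * a sub-search can stop as soon as its accumulated cost can no longer win.
 * The self-contained fragment below shows only that accumulate-and-prune
 * pattern; rd_cost(), search_subblock() and every constant in it are
 * placeholders, not libvpx symbols (the real cost macro is RDCOST() in
 * vp9_rdopt.h and uses fixed-point rdmult/rddiv arithmetic).
 */
#include <stdint.h>
#include <stdio.h>

/* Stand-in for RDCOST(x->rdmult, x->rddiv, rate, dist). */
static int64_t rd_cost(int64_t lambda, int rate, int64_t dist) {
  return lambda * (int64_t)rate + dist;
}

/* Hypothetical per-subblock search standing in for the recursive
 * rd_pick_partition() call; the returned numbers are made up. */
static void search_subblock(int i, int *rate, int64_t *dist) {
  *rate = 100 + 10 * i;
  *dist = 4000 - 500 * (int64_t)i;
}

static int64_t split_partition_cost(int64_t lambda, int64_t best_rd) {
  int rate_sum = 0;
  int64_t dist_sum = 0, sum_rd = 0;
  int i;

  /* Accumulate the four sub-partitions, handing each child only the budget
   * that is still left and stopping early once the running cost can no
   * longer beat the best partitioning seen so far. */
  for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
    int r;
    int64_t d;
    search_subblock(i, &r, &d);
    rate_sum += r;
    dist_sum += d;
    sum_rd = rd_cost(lambda, rate_sum, dist_sum);
  }

  /* Mirrors the "i == 4" check above: a split only competes if all four
   * sub-blocks were actually evaluated within budget. */
  return (i == 4 && sum_rd < best_rd) ? sum_rd : INT64_MAX;
}

int main(void) {
  printf("split rd cost: %lld\n",
         (long long)split_partition_cost(120, (int64_t)1 << 30));
  return 0;
}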
+ MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; + int bsl = b_width_log2(BLOCK_SIZE_SB64X64), bs = 1 << bsl; + int ms = bs / 2; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + int pl; + int r; + int64_t d; + + save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64); + + // Default is non mask (all reference frames allowed. + cpi->ref_frame_mask = 0; + + // Do RD search for 64x64. + if ((mi_row + (ms >> 1) < cm->mi_rows) && + (mi_col + (ms >> 1) < cm->mi_cols)) { + cpi->set_ref_frame_mask = 1; + pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_SIZE_SB64X64, + get_block_context(x, BLOCK_SIZE_SB64X64), INT64_MAX); + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); + r += x->partition_cost[pl][PARTITION_NONE]; + + *(get_sb_partitioning(x, BLOCK_SIZE_SB64X64)) = BLOCK_SIZE_SB64X64; + cpi->set_ref_frame_mask = 0; + } + + *rate = r; + *dist = d; + // RDCOST(x->rdmult, x->rddiv, r, d) + + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64); + + /*if (srate < INT_MAX && sdist < INT_MAX) + encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64); + + if (bsize == BLOCK_SIZE_SB64X64) { + assert(tp_orig < *tp); + assert(srate < INT_MAX); + assert(sdist < INT_MAX); + } else { + assert(tp_orig == *tp); + } + */ +} + +static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, + int *totalrate) { + VP9_COMMON * const cm = &cpi->common; int mi_col; // Initialize the left context for the new SB row @@ -1398,19 +1865,56 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) { - int dummy_rate, dummy_dist; - if (cpi->speed < 5) { - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, - &dummy_rate, &dummy_dist); - } else { + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += MI_BLOCK_SIZE) { + int dummy_rate; + int64_t dummy_dist; + + // Initialize a mask of modes that we will not consider; + // cpi->unused_mode_skip_mask = 0x0000000AAE17F800 (test no golden) + if (cpi->common.frame_type == KEY_FRAME) + cpi->unused_mode_skip_mask = 0; + else + cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00; + + if (cpi->sf.reference_masking) { + rd_pick_reference_frame(cpi, tp, mi_row, mi_col, + &dummy_rate, &dummy_dist); + } + + if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning || + cpi->sf.use_one_partition_size_always ) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO *m = cm->mi + idx_str; - // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64); - choose_partitioning(cpi, cm->mi, mi_row, mi_col); - rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, - &dummy_rate, &dummy_dist); + MODE_INFO *p = cm->prev_mi + idx_str; + + if (cpi->sf.use_one_partition_size_always) { + set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); + set_partitioning(cpi, m, cpi->sf.always_this_block_size); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1); + } else if (cpi->sf.partition_by_variance) { + choose_partitioning(cpi, cm->mi, mi_row, mi_col); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1); + } else { + if ((cpi->common.current_video_frame + % 
cpi->sf.last_partitioning_redo_frequency) == 0 + || cm->prev_mi == 0 + || cpi->common.show_frame == 0 + || cpi->common.frame_type == KEY_FRAME + || cpi->is_src_frame_alt_ref) { + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1, INT64_MAX); + } else { + copy_partitioning(cpi, m, p); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1); + } + } + } else { + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist, 1, INT64_MAX); } } } @@ -1419,15 +1923,12 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); x->act_zbin_adj = 0; cpi->seg0_idx = 0; xd->mode_info_stride = cm->mode_info_stride; - xd->frame_type = cm->frame_type; - - xd->frames_since_golden = cm->frames_since_golden; - xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame; // reset intra mode contexts if (cm->frame_type == KEY_FRAME) @@ -1437,62 +1938,65 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { vp9_setup_src_planes(x, cpi->Source, 0, 0); // TODO(jkoleszar): are these initializations required? - setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL, - 0, 0, NULL, NULL); + setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], + 0, 0, NULL); setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0); - vp9_build_block_offsets(x); - - vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_context->mbmi.uv_mode = DC_PRED; vp9_zero(cpi->y_mode_count) vp9_zero(cpi->y_uv_mode_count) - vp9_zero(cm->fc.inter_mode_counts) + vp9_zero(cm->counts.inter_mode) vp9_zero(cpi->partition_count); vp9_zero(cpi->intra_inter_count); vp9_zero(cpi->comp_inter_count); vp9_zero(cpi->single_ref_count); vp9_zero(cpi->comp_ref_count); - vp9_zero(cm->fc.tx_count_32x32p); - vp9_zero(cm->fc.tx_count_16x16p); - vp9_zero(cm->fc.tx_count_8x8p); - vp9_zero(cm->fc.mbskip_count); + vp9_zero(cm->counts.tx); + vp9_zero(cm->counts.mbskip); // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. 
- vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 * - MAX_MB_PLANE * mi_cols_aligned_to_sb(cm)); - vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(cm)); + vpx_memset(cm->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols); + vpx_memset(cm->above_seg_context, 0, + sizeof(PARTITION_CONTEXT) * aligned_mi_cols); } static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; - cpi->mb.optimize = 0; - cpi->common.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; - cpi->common.txfm_mode = ONLY_4X4; + // printf("Switching to lossless\n"); + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; + cpi->mb.optimize = 0; + cpi->mb.e_mbd.lf.filter_level = 0; + cpi->zbin_mode_boost_enabled = 0; + cpi->common.tx_mode = ONLY_4X4; } else { - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; + // printf("Not lossless\n"); + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; } } +static void switch_tx_mode(VP9_COMP *cpi) { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cpi->common.tx_mode >= ALLOW_32X32) + cpi->common.tx_mode = ALLOW_32X32; +} static void encode_frame_internal(VP9_COMP *cpi) { int mi_row; - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; int totalrate; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", @@ -1514,26 +2018,25 @@ static void encode_frame_internal(VP9_COMP *cpi) { // Reset frame count of inter 0,0 motion vector usage. cpi->inter_zz_count = 0; - vp9_zero(cm->fc.switchable_interp_count); - vp9_zero(cpi->best_switchable_interp_count); + vp9_zero(cm->counts.switchable_interp); + vp9_zero(cpi->txfm_stepdown_count); xd->mode_info_context = cm->mi; xd->prev_mode_info_context = cm->prev_mi; vp9_zero(cpi->NMVcount); vp9_zero(cpi->coef_counts); - vp9_zero(cm->fc.eob_branch_counts); + vp9_zero(cm->counts.eob_branch); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; + cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 + && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); vp9_frame_init_quantizer(cpi); vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q); vp9_initialize_me_consts(cpi, cm->base_qindex); + switch_tx_mode(cpi); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { // Initialize encode frame context. @@ -1546,36 +2049,38 @@ static void encode_frame_internal(VP9_COMP *cpi) { // re-initencode frame context. 
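/*
 * [Editor's illustrative sketch -- not part of the commit above.]
 * switch_lossless_mode() in the hunk above selects lossless operation by
 * swapping the 4x4/8x4 forward- and inverse-transform function pointers to
 * their Walsh-Hadamard variants, disabling trellis optimization and loop
 * filtering, and pinning the transform mode to ONLY_4X4.  The fragment below
 * isolates that pointer-swap idea in a standalone form; the struct and the
 * *_stub transforms are placeholders rather than libvpx symbols, and the
 * field handling is simplified relative to the real function.
 */
#include <stdint.h>
#include <string.h>

typedef void (*fwd_txfm_fn)(const int16_t *input, int16_t *output, int pitch);

struct coder_hooks {
  fwd_txfm_fn fwd_txm4x4;  /* forward transform used for 4x4 blocks */
  int optimize;            /* trellis coefficient optimization on/off */
  int only_4x4;            /* mirrors tx_mode == ONLY_4X4 */
};

static void fdct4x4_stub(const int16_t *in, int16_t *out, int pitch) {
  (void)pitch;
  memcpy(out, in, 16 * sizeof(*out));  /* placeholder body */
}

static void walsh4x4_stub(const int16_t *in, int16_t *out, int pitch) {
  (void)pitch;
  memcpy(out, in, 16 * sizeof(*out));  /* placeholder body */
}

static void set_lossless_mode(struct coder_hooks *h, int lossless) {
  /* Lossless coding needs a perfectly reversible transform, so the DCT is
   * replaced by the Walsh-Hadamard transform and only 4x4 blocks are used;
   * trellis optimization is skipped because no coefficient may be altered. */
  h->fwd_txm4x4 = lossless ? walsh4x4_stub : fdct4x4_stub;
  h->optimize = lossless ? 0 : 1;
  h->only_4x4 = lossless;
}

int main(void) {
  struct coder_hooks h;
  set_lossless_mode(&h, 1);
  return h.fwd_txm4x4 == walsh4x4_stub ? 0 : 1;
}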
init_encode_frame_mb_context(cpi); - vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff)); - vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff)); - vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes)); + vp9_zero(cpi->rd_comp_pred_diff); + vp9_zero(cpi->rd_filter_diff); + vp9_zero(cpi->rd_tx_select_diff); + vp9_zero(cpi->rd_tx_select_threshes); set_prev_mi(cm); { - struct vpx_usec_timer emr_timer; + struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); { // Take tiles into account and give start/end MB int tile_col, tile_row; TOKENEXTRA *tp = cpi->tok; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; - for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + for (tile_row = 0; tile_row < tile_rows; tile_row++) { vp9_get_tile_row_offsets(cm, tile_row); - for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { TOKENEXTRA *tp_old = tp; // For each row of SBs in the frame vp9_get_tile_col_offsets(cm, tile_col); for (mi_row = cm->cur_tile_mi_row_start; - mi_row < cm->cur_tile_mi_row_end; - mi_row += 8) + mi_row < cm->cur_tile_mi_row_end; mi_row += 8) encode_sb_row(cpi, mi_row, &tp, &totalrate); + cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); - assert(tp - cpi->tok <= - get_token_alloc(cm->mb_rows, cm->mb_cols)); + assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); } } } @@ -1584,6 +2089,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); } + if (cpi->sf.skip_encode_sb) { + int j; + unsigned int intra_count = 0, inter_count = 0; + for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { + intra_count += cpi->intra_inter_count[j][0]; + inter_count += cpi->intra_inter_count[j][1]; + } + cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count); + cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME); + cpi->sf.skip_encode_frame &= cm->show_frame; + } else { + cpi->sf.skip_encode_frame = 0; + } + // 256 rate units to the bit, // projected_frame_size in units of BYTES cpi->projected_frame_size = totalrate >> 8; @@ -1599,12 +2118,11 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { MACROBLOCKD *xd = &cpi->mb.e_mbd; int ref_flags = cpi->ref_frame_flags; - if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { + if (vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { return 0; } else { - return (!!(ref_flags & VP9_GOLD_FLAG) + - !!(ref_flags & VP9_LAST_FLAG) + - !!(ref_flags & VP9_ALT_FLAG)) >= 2; + return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) + + !!(ref_flags & VP9_ALT_FLAG)) >= 2; } } @@ -1631,35 +2149,32 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, } } -static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, - int mis, TX_SIZE txfm_max, - int bw, int bh, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MB_MODE_INFO *const mbmi = &mi->mbmi; +static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis, + TX_SIZE txfm_max, int bw, int bh, int mi_row, + int mi_col, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MB_MODE_INFO * const mbmi = &mi->mbmi; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (mbmi->txfm_size > txfm_max) { - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = mbmi->segment_id; + MACROBLOCK * const 
x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; const int ymbs = MIN(bh, cm->mi_rows - mi_row); const int xmbs = MIN(bw, cm->mi_cols - mi_col); xd->mode_info_context = mi; - assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) || + assert(vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) || get_skip_flag(mi, mis, ymbs, xmbs)); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } } static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, - TX_SIZE txfm_max, - int mi_row, int mi_col, + TX_SIZE txfm_max, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; const int mis = cm->mode_info_stride; int bwl, bhl; const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); @@ -1671,18 +2186,18 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, bhl = mi_height_log2(mi->mbmi.sb_type); if (bwl == bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, - mi_row, mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row, + mi_col, bsize); } else if (bwl == bsl && bhl < bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, - mi_row, mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col, + bsize); reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs, mi_row + bs, mi_col, bsize); } else if (bwl < bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, - mi_row, mi_col, bsize); - reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, - mi_row, mi_col + bs, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col, + bsize); + reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row, + mi_col + bs, bsize); } else { BLOCK_SIZE_TYPE subsize; int n; @@ -1700,43 +2215,82 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, for (n = 0; n < 4; n++) { const int y_idx = n >> 1, x_idx = n & 0x01; - reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, - txfm_max, mi_row + y_idx * bs, - mi_col + x_idx * bs, subsize); + reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max, + mi_row + y_idx * bs, mi_col + x_idx * bs, + subsize); } } } static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; int mi_row, mi_col; const int mis = cm->mode_info_stride; MODE_INFO *mi, *mi_ptr = cm->mi; - for (mi_row = 0; mi_row < cm->mi_rows; - mi_row += 8, mi_ptr += 8 * mis) { + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = 0; mi_col < cm->mi_cols; - mi_col += 8, mi += 8) { - reset_skip_txfm_size_sb(cpi, mi, txfm_max, - mi_row, mi_col, BLOCK_SIZE_SB64X64); + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) { + reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col, + BLOCK_SIZE_SB64X64); + } + } +} + +static int get_frame_type(VP9_COMP *cpi) { + int frame_type; + if (cpi->common.frame_type == KEY_FRAME) + frame_type = 0; + else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) + frame_type = 3; + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) + frame_type = 1; + else + frame_type = 2; + return frame_type; +} + +static void select_tx_mode(VP9_COMP *cpi) { + if (cpi->oxcf.lossless) { + cpi->common.tx_mode = ONLY_4X4; + } else if (cpi->common.current_video_frame == 0) { + 
cpi->common.tx_mode = TX_MODE_SELECT; + } else { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { + cpi->common.tx_mode = ALLOW_32X32; + } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + int frame_type = get_frame_type(cpi); + cpi->common.tx_mode = + cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] + > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? + ALLOW_32X32 : TX_MODE_SELECT; + } else { + unsigned int total = 0; + int i; + for (i = 0; i < TX_SIZE_MAX_SB; ++i) + total += cpi->txfm_stepdown_count[i]; + if (total) { + double fraction = (double)cpi->txfm_stepdown_count[0] / total; + cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT; + // printf("fraction = %f\n", fraction); + } // else keep unchanged } } } void vp9_encode_frame(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a - // differnt sign bias and that buffer is then the fixed ref. However, this + // different sign bias and that buffer is then the fixed ref. However, this // requires further work in the rd loop. For now the only supported encoder - // side behaviour is where the ALT ref buffer has oppositie sign bias to + // side behaviour is where the ALT ref buffer has opposite sign bias to // the other two. - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[GOLDEN_FRAME]) + || (cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[LAST_FRAME])) { cm->allow_comp_inter_inter = 0; } else { cm->allow_comp_inter_inter = 1; @@ -1746,9 +2300,8 @@ void vp9_encode_frame(VP9_COMP *cpi) { } if (cpi->sf.RD) { - int i, frame_type, pred_type; - TXFM_MODE txfm_type; - + int i, pred_type; + INTERPOLATIONFILTERTYPE filter_type; /* * This code does a single RD pass over the whole frame assuming * either compound, single or hybrid prediction as per whatever has @@ -1758,86 +2311,78 @@ void vp9_encode_frame(VP9_COMP *cpi) { * that for subsequent frames. * It does the same analysis for transform size selection also. 
*/ - if (cpi->common.frame_type == KEY_FRAME) - frame_type = 0; - else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) - frame_type = 3; - else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - frame_type = 1; - else - frame_type = 2; + int frame_type = get_frame_type(cpi); /* prediction (compound, single or hybrid) mode selection */ if (frame_type == 3 || !cm->allow_comp_inter_inter) pred_type = SINGLE_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][1] > - cpi->rd_prediction_type_threshes[frame_type][0] && - cpi->rd_prediction_type_threshes[frame_type][1] > - cpi->rd_prediction_type_threshes[frame_type][2] && - check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) + else if (cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][0] + && cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][2] + && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) pred_type = COMP_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][0] > - cpi->rd_prediction_type_threshes[frame_type][2]) + else if (cpi->rd_prediction_type_threshes[frame_type][0] + > cpi->rd_prediction_type_threshes[frame_type][2]) pred_type = SINGLE_PREDICTION_ONLY; else pred_type = HYBRID_PREDICTION; + /* filter type selection */ + // FIXME(rbultje) for some odd reason, we often select smooth_filter + // as default filter for ARF overlay frames. This is a REALLY BAD + // IDEA so we explicitly disable it here. + if (frame_type != 3 && + cpi->rd_filter_threshes[frame_type][1] > + cpi->rd_filter_threshes[frame_type][0] && + cpi->rd_filter_threshes[frame_type][1] > + cpi->rd_filter_threshes[frame_type][2] && + cpi->rd_filter_threshes[frame_type][1] > + cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { + filter_type = vp9_switchable_interp[1]; + } else if (cpi->rd_filter_threshes[frame_type][2] > + cpi->rd_filter_threshes[frame_type][0] && + cpi->rd_filter_threshes[frame_type][2] > + cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { + filter_type = vp9_switchable_interp[2]; + } else if (cpi->rd_filter_threshes[frame_type][0] > + cpi->rd_filter_threshes[frame_type][VP9_SWITCHABLE_FILTERS]) { + filter_type = vp9_switchable_interp[0]; + } else { + filter_type = SWITCHABLE; + } + /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */ cpi->mb.e_mbd.lossless = 0; if (cpi->oxcf.lossless) { - txfm_type = ONLY_4X4; cpi->mb.e_mbd.lossless = 1; - } else -#if 0 - /* FIXME (rbultje): this code is disabled until we support cost updates - * while a frame is being encoded; the problem is that each time we - * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities - * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging - * further behind and not being chosen for subsequent frames either. This - * is essentially a local minimum problem that we can probably fix by - * estimating real costs more closely within a frame, perhaps by re- - * calculating costs on-the-fly as frame encoding progresses. 
*/ - if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = TX_MODE_SELECT; - } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8] - && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] - ) { - txfm_type = ONLY_4X4; - } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = ALLOW_16X16; - } else - txfm_type = ALLOW_8X8; -#else - txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] > - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? - ALLOW_32X32 : TX_MODE_SELECT; -#endif - cpi->common.txfm_mode = txfm_type; + } + + select_tx_mode(cpi); cpi->common.comp_pred_mode = pred_type; + cpi->common.mcomp_filter_type = filter_type; encode_frame_internal(cpi); for (i = 0; i < NB_PREDICTION_TYPES; ++i) { - const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs); + const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs); cpi->rd_prediction_type_threshes[frame_type][i] += diff; cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; + cpi->rd_filter_threshes[frame_type][i] = + (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; + } + for (i = 0; i < NB_TXFM_MODES; ++i) { int64_t pd = cpi->rd_tx_select_diff[i]; int diff; if (i == TX_MODE_SELECT) pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX_SB - 1), 0); - diff = (int)(pd / cpi->common.MBs); + diff = (int) (pd / cpi->common.MBs); cpi->rd_tx_select_threshes[frame_type][i] += diff; cpi->rd_tx_select_threshes[frame_type][i] /= 2; } @@ -1860,64 +2405,47 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } - if (cpi->common.txfm_mode == TX_MODE_SELECT) { + if (cpi->common.tx_mode == TX_MODE_SELECT) { int count4x4 = 0; int count8x8_lp = 0, count8x8_8x8p = 0; int count16x16_16x16p = 0, count16x16_lp = 0; int count32x32 = 0; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count4x4 += cm->fc.tx_count_32x32p[i][TX_4X4]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count4x4 += cm->fc.tx_count_16x16p[i][TX_4X4]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count4x4 += cm->fc.tx_count_8x8p[i][TX_4X4]; - - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count8x8_lp += cm->fc.tx_count_32x32p[i][TX_8X8]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count8x8_lp += cm->fc.tx_count_16x16p[i][TX_8X8]; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + count4x4 += cm->counts.tx.p32x32[i][TX_4X4]; + count4x4 += cm->counts.tx.p16x16[i][TX_4X4]; + count4x4 += cm->counts.tx.p8x8[i][TX_4X4]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count8x8_8x8p += cm->fc.tx_count_8x8p[i][TX_8X8]; + count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8]; + count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8]; + count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8]; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count16x16_16x16p += cm->fc.tx_count_16x16p[i][TX_16X16]; - - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count16x16_lp += cm->fc.tx_count_32x32p[i][TX_16X16]; - - for (i = 0; i < TX_SIZE_CONTEXTS; i++) - count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32]; + count16x16_16x16p += 
cm->counts.tx.p16x16[i][TX_16X16]; + count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16]; + count32x32 += cm->counts.tx.p32x32[i][TX_32X32]; + } - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && - count32x32 == 0) { - cpi->common.txfm_mode = ALLOW_8X8; + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 + && count32x32 == 0) { + cpi->common.tx_mode = ALLOW_8X8; reset_skip_txfm_size(cpi, TX_8X8); - } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && - count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { - cpi->common.txfm_mode = ONLY_4X4; + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 + && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { + cpi->common.tx_mode = ONLY_4X4; reset_skip_txfm_size(cpi, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { - cpi->common.txfm_mode = ALLOW_32X32; + cpi->common.tx_mode = ALLOW_32X32; } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { - cpi->common.txfm_mode = ALLOW_16X16; + cpi->common.tx_mode = ALLOW_16X16; reset_skip_txfm_size(cpi, TX_16X16); } } - - // Update interpolation filter strategy for next frame. - if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter)) - vp9_select_interp_filter_type(cpi); } else { encode_frame_internal(cpi); } } -void vp9_build_block_offsets(MACROBLOCK *x) { -} - static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { const MACROBLOCKD *xd = &x->e_mbd; const MB_PREDICTION_MODE m = xd->mode_info_context->mbmi.mode; @@ -1931,11 +2459,13 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { ++cpi->y_mode_count[MIN(bsl, 3)][m]; } else { int idx, idy; - int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type); - int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type); - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode.first; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[ + xd->mode_info_context->mbmi.sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[ + xd->mode_info_context->mbmi.sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode; ++cpi->y_mode_count[0][m]; } } @@ -1957,26 +2487,28 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) { b = 4 * act + cpi->activity_avg; if (act > cpi->activity_avg) - x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1; + x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1; else - x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b); + x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); #endif } -static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - int n; +static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, + int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = xd->mode_info_context; MB_MODE_INFO *mbmi = &mi->mbmi; unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; - const int bwl = mi_width_log2(bsize); - const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize); + const int mi_width = 
num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; x->rd_search = 0; + x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && + xd->q_index < QIDX_SKIP_THRESH); + if (x->skip_encode) + return; if (cm->frame_type == KEY_FRAME) { if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { @@ -2015,10 +2547,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (mbmi->ref_frame[0] == INTRA_FRAME) { - vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? - BLOCK_SIZE_SB8X8 : bsize); - vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? - BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_y( + cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_uv( + cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); if (output_enabled) sum_intra_stats(cpi, x); } else { @@ -2032,58 +2564,51 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); - setup_pre_planes(xd, ref_fb, second_ref_fb, - mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv); + setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, + &xd->scale_factor[0]); + setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, + &xd->scale_factor[1]); + - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, - bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 - : bsize); + vp9_build_inter_predictors_sb( + xd, mi_row, mi_col, + bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize); } if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { - vp9_tokenize_sb(cpi, xd, t, !output_enabled, + vp9_tokenize_sb(cpi, t, !output_enabled, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } else if (!x->skip) { vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - vp9_tokenize_sb(cpi, xd, t, !output_enabled, + vp9_tokenize_sb(cpi, t, !output_enabled, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } else { - // FIXME(rbultje): not tile-aware (mi - 1) - int mb_skip_context = - (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff; + int mb_skip_context = xd->left_available ? (mi - 1)->mbmi.mb_skip_coeff : 0; + mb_skip_context += (mi - mis)->mbmi.mb_skip_coeff; mbmi->mb_skip_coeff = 1; if (output_enabled) - cm->fc.mbskip_count[mb_skip_context][1]++; - vp9_reset_sb_tokens_context(xd, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + cm->counts.mbskip[mb_skip_context][1]++; + vp9_reset_sb_tokens_context( + xd, (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 : bsize); } // copy skip flag on all mb_mode_info contexts in this SB // if this was a skip at this txfm size - for (n = 1; n < bw * bh; n++) { - const int x_idx = n & (bw - 1), y_idx = n >> bwl; - if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows) - mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff; - } + vp9_set_pred_flag_mbskip(cm, bsize, mi_row, mi_col, mi->mbmi.mb_skip_coeff); if (output_enabled) { - if (cm->txfm_mode == TX_MODE_SELECT && - mbmi->sb_type >= BLOCK_SIZE_SB8X8 && - !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff || - vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { - const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); - if (bsize >= BLOCK_SIZE_SB32X32) { - cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++; - } else if (bsize >= BLOCK_SIZE_MB16X16) { - cm->fc.tx_count_16x16p[context][mbmi->txfm_size]++; - } else { - cm->fc.tx_count_8x8p[context][mbmi->txfm_size]++; - } + if (cm->tx_mode == TX_MODE_SELECT && + mbmi->sb_type >= BLOCK_SIZE_SB8X8 && + !(mbmi->ref_frame[0] != INTRA_FRAME && + (mbmi->mb_skip_coeff || + vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) { + const uint8_t context = vp9_get_pred_context_tx_size(xd); + update_tx_counts(bsize, context, mbmi->txfm_size, &cm->counts.tx); } else { int x, y; - TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode; - // The new intra coding scheme requires no change of transform size + TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode; + // The new intra coding scheme requires no change of transform size if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32) sz = TX_16X16; @@ -2097,8 +2622,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, sz = TX_4X4; } - for (y = 0; y < bh; y++) { - for (x = 0; x < bw; x++) { + for (y = 0; y < mi_height; y++) { + for (x = 0; x < mi_width; x++) { if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) { mi[mis * y + x].mbmi.txfm_size = sz; } diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h index d37bdca..3991969 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libvpx/vp9/encoder/vp9_encodeframe.h @@ -15,8 +15,6 @@ struct macroblock; struct yv12_buffer_config; -void vp9_build_block_offsets(struct macroblock *x); - void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mb_row, int mb_col); diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c index f29dba0..d49e532 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.c +++ b/libvpx/vp9/encoder/vp9_encodeintra.c @@ -18,15 +18,11 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; (void) cpi; + x->skip_encode = 0; mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; - if (use_16x16_pred) { - mbmi->txfm_size = mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? TX_16X16 : TX_8X8; - vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); - } else { - mbmi->txfm_size = TX_4X4; - vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); - } - + mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? 
+ TX_16X16 : TX_8X8) : TX_4X4; + vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); return vp9_get_mb_ss(x->plane[0].src_diff); } diff --git a/libvpx/vp9/encoder/vp9_encodeintra.h b/libvpx/vp9/encoder/vp9_encodeintra.h index 14d144b..16ac59e 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.h +++ b/libvpx/vp9/encoder/vp9_encodeintra.h @@ -14,6 +14,8 @@ #include "vp9/encoder/vp9_onyx_int.h" int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred); +void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg); void vp9_encode_intra_block_y(VP9_COMMON *const cm, MACROBLOCK *mb, BLOCK_SIZE_TYPE bs); void vp9_encode_intra_block_uv(VP9_COMMON *const cm, MACROBLOCK *mb, diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 4f45496..66e35a9 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -22,10 +22,10 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride) { +void vp9_subtract_block_c(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { @@ -78,7 +78,6 @@ void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) -#define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) typedef struct vp9_token_state vp9_token_state; struct vp9_token_state { @@ -110,14 +109,13 @@ static const int plane_rd_mult[4] = { // This function is a place holder for now but may ultimately need // to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int *scan, - const int *nb, +static int trellis_get_coeff_context(const int16_t *scan, + const int16_t *nb, int idx, int token, - uint8_t *token_cache, - int pad, int l) { + uint8_t *token_cache) { int bak = token_cache[scan[idx]], pt; token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l); + pt = get_coef_context(nb, token_cache, idx + 1); token_cache[scan[idx]] = bak; return pt; } @@ -142,8 +140,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, int best, band, pt; PLANE_TYPE type = xd->plane[plane].plane_type; int err_mult = plane_rd_mult[type]; - int default_eob, pad; - int const *scan, *nb; + int default_eob; + const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; const int ib = txfrm_block_to_raster_block(xd, bsize, plane, @@ -156,27 +154,21 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); switch (tx_size) { default: - case TX_4X4: { - const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT; + case TX_4X4: default_eob = 16; - scan = get_scan_4x4(tx_type); + scan = get_scan_4x4(get_tx_type_4x4(type, xd, ib)); band_translate = vp9_coefband_trans_4x4; break; - } - case TX_8X8: { - const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT; - scan = get_scan_8x8(tx_type); + case TX_8X8: + scan = get_scan_8x8(get_tx_type_8x8(type, xd)); default_eob = 64; band_translate = vp9_coefband_trans_8x8plus; break; - } - case TX_16X16: { - const TX_TYPE tx_type = plane == 0 ? 
get_tx_type_16x16(xd, ib) : DCT_DCT; - scan = get_scan_16x16(tx_type); + case TX_16X16: + scan = get_scan_16x16(get_tx_type_16x16(type, xd)); default_eob = 256; band_translate = vp9_coefband_trans_8x8plus; break; - } case TX_32X32: scan = vp9_default_scan_32x32; default_eob = 1024; @@ -190,7 +182,6 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, if (mb->e_mbd.mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) rdmult = (rdmult * 9) >> 4; rddiv = mb->rddiv; - memset(best_index, 0, sizeof(best_index)); /* Initialize the sentinel node of the trellis. */ tokens[eob][0].rate = 0; tokens[eob][0].error = 0; @@ -202,7 +193,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ qcoeff_ptr[scan[i]]].token]; - nb = vp9_get_coef_neighbors_handle(scan, &pad); + nb = vp9_get_coef_neighbors_handle(scan); for (i = eob; i-- > i0;) { int base_bits, d2, dx; @@ -221,14 +212,13 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, /* Consider both possible successor states. */ if (next < default_eob) { band = get_coef_band(band_translate, i + 1); - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, - pad, default_eob); + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += - mb->token_costs_noskip[tx_size][type][ref][band][pt] - [tokens[next][0].token]; + mb->token_costs[tx_size][type][ref][0][band][pt] + [tokens[next][0].token]; rate1 += - mb->token_costs_noskip[tx_size][type][ref][band][pt] - [tokens[next][1].token]; + mb->token_costs[tx_size][type][ref][0][band][pt] + [tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ @@ -274,24 +264,14 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, if (next < default_eob) { band = get_coef_band(band_translate, i + 1); if (t0 != DCT_EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, - pad, default_eob); - if (!x) - rate0 += mb->token_costs[tx_size][type][ref][band][pt][ - tokens[next][0].token]; - else - rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][ - tokens[next][0].token]; + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); + rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt] + [tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache, - pad, default_eob); - if (!x) - rate1 += mb->token_costs[tx_size][type][ref][band][pt][ - tokens[next][1].token]; - else - rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][ - tokens[next][1].token]; + pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); + rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt] + [tokens[next][1].token]; } } @@ -323,14 +303,15 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, /* Update the cost of each path if we're past the EOB token. */ if (t0 != DCT_EOB_TOKEN) { tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][band][0][t0]; + mb->token_costs[tx_size][type][ref][1][band][0][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != DCT_EOB_TOKEN) { tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][band][0][t1]; + mb->token_costs[tx_size][type][ref][1][band][0][t1]; tokens[next][1].token = ZERO_TOKEN; } + best_index[i][0] = best_index[i][1] = 0; /* Don't update next, because we didn't add a new node. 
*/ } } @@ -344,8 +325,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0]; - rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0]; + rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; @@ -369,12 +350,6 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, *a = *l = (final_eob > 0); } -struct optimize_block_args { - VP9_COMMON *cm; - MACROBLOCK *x; - struct optimize_ctx *ctx; -}; - void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb, struct optimize_ctx *ctx) { @@ -390,7 +365,7 @@ void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { - const struct optimize_block_args* const args = arg; + const struct encode_b_args* const args = arg; vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x, args->ctx); } @@ -427,7 +402,7 @@ void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct optimize_ctx ctx; - struct optimize_block_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_optimize_init(&x->e_mbd, bsize, &ctx); foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg); } @@ -435,64 +410,83 @@ void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct optimize_ctx ctx; - struct optimize_block_args arg = {cm, x, &ctx}; + struct encode_b_args arg = {cm, x, &ctx}; vp9_optimize_init(&x->e_mbd, bsize, &ctx); foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg); } -struct encode_b_args { - VP9_COMMON *cm; - MACROBLOCK *x; - struct optimize_ctx *ctx; -}; - -static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; - const int bw = plane_block_width(bsize, &xd->plane[plane]); - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16); - int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane, - raster_block, - x->plane[plane].src_diff); - TX_TYPE tx_type = DCT_DCT; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + const int16_t *scan, *iscan; + uint16_t *eob = &pd->eobs[block]; + const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl; + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; + int xoff, yoff; + int16_t *src_diff; - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: + scan = vp9_default_scan_32x32; + iscan = vp9_default_iscan_32x32; + 
block >>= 6; + xoff = 32 * (block & twmask); + yoff = 32 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->rd_search) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2); + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else - vp9_short_fdct32x32(src_diff, coeff, bw * 2); + vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - if (tx_type != DCT_DCT) - vp9_short_fht16x16(src_diff, coeff, bw, tx_type); - else - x->fwd_txm16x16(src_diff, coeff, bw * 2); + scan = vp9_default_scan_16x16; + iscan = vp9_default_iscan_16x16; + block >>= 4; + xoff = 16 * (block & twmask); + yoff = 16 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; + x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_8X8: - tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; - if (tx_type != DCT_DCT) - vp9_short_fht8x8(src_diff, coeff, bw, tx_type); - else - x->fwd_txm8x8(src_diff, coeff, bw * 2); + scan = vp9_default_scan_8x8; + iscan = vp9_default_iscan_8x8; + block >>= 2; + xoff = 8 * (block & twmask); + yoff = 8 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; + x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; - if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, bw, tx_type); - else - x->fwd_txm4x4(src_diff, coeff, bw * 2); + scan = vp9_default_scan_4x4; + iscan = vp9_default_iscan_4x4; + xoff = 4 * (block & twmask); + yoff = 4 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; + x->fwd_txm4x4(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; default: assert(0); } - - vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type); } static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -507,41 +501,32 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, pd->dst.buf, pd->dst.stride); - TX_TYPE tx_type = DCT_DCT; - xform_quant(plane, block, bsize, ss_txfrm_size, arg); if (x->optimize) vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx); + if (x->skip_encode) + return; + if (pd->eobs[block] == 0) + return; + switch (ss_txfrm_size / 2) { case TX_32X32: vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); - else - vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); break; case TX_8X8: - tx_type = plane == 0 ? 
get_tx_type_8x8(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); - else - vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - // this is like vp9_short_idct4x4 but has a special case around eob<=1 - // which is significant (not just an optimization) for the lossless - // case. - inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, - dst, pd->dst.stride); - else - vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. + inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, + dst, pd->dst.stride); break; } } @@ -597,92 +582,157 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { foreach_transformed_block(xd, bsize, encode_block, &arg); } -static void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { +void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); - const int bw = plane_block_width(bsize, pd); - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - - uint8_t *const src = raster_block_offset_uint8(xd, bsize, plane, raster_block, - p->src.buf, p->src.stride); - uint8_t *const dst = raster_block_offset_uint8(xd, bsize, plane, raster_block, - pd->dst.buf, pd->dst.stride); - int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane, - raster_block, - p->src_diff); - - const int txfm_b_size = 4 << tx_size; - int ib = raster_block; - int tx_ib = ib >> tx_size; - int plane_b_size; - + int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + const int16_t *scan, *iscan; TX_TYPE tx_type; - int mode, b_mode; + MB_PREDICTION_MODE mode; + const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl; + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; + int xoff, yoff; + uint8_t *src, *dst; + int16_t *src_diff; + uint16_t *eob = &pd->eobs[block]; if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { extend_for_intra(xd, plane, block, bsize, ss_txfrm_size); } - mode = plane == 0? 
mbmi->mode: mbmi->uv_mode; - if (plane == 0 && - mbmi->sb_type < BLOCK_SIZE_SB8X8 && - mbmi->ref_frame[0] == INTRA_FRAME) - b_mode = xd->mode_info_context->bmi[ib].as_mode.first; - else - b_mode = mode; - - assert(b_mode >= DC_PRED && b_mode <= TM_PRED); - - plane_b_size = b_width_log2(bsize) - pd->subsampling_x; - vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, - dst, pd->dst.stride); - vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw, - src, p->src.stride, dst, pd->dst.stride); - - xform_quant(plane, block, bsize, ss_txfrm_size, arg); - - // if (x->optimize) // vp9_optimize_b(plane, block, bsize, ss_txfrm_size, // args->cm, x, args->ctx); - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: + scan = vp9_default_scan_32x32; + iscan = vp9_default_iscan_32x32; + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + block >>= 6; + xoff = 32 * (block & twmask); + yoff = 32 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(32, 32, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (x->rd_search) + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); + else + vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: - tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); + tx_type = get_tx_type_16x16(pd->plane_type, xd); + scan = get_scan_16x16(tx_type); + iscan = get_iscan_16x16(tx_type); + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + block >>= 4; + xoff = 16 * (block & twmask); + yoff = 16 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(16, 16, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); else - vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); + x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); + else + vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); + } break; case TX_8X8: - tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); + tx_type = get_tx_type_8x8(pd->plane_type, xd); + scan = get_scan_8x8(tx_type); + iscan = get_iscan_8x8(tx_type); + mode = plane == 0 ? 
mbmi->mode : mbmi->uv_mode; + block >>= 2; + xoff = 8 * (block & twmask); + yoff = 8 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(8, 8, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type); else - vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); + x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); + else + vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); + } break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; - if (tx_type == DCT_DCT) - // this is like vp9_short_idct4x4 but has a special case around eob<=1 - // which is significant (not just an optimization) for the lossless - // case. - inverse_transform_b_4x4_add(xd, pd->eobs[block], dqcoeff, - dst, pd->dst.stride); + tx_type = get_tx_type_4x4(pd->plane_type, xd, block); + scan = get_scan_4x4(tx_type); + iscan = get_iscan_4x4(tx_type); + if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0) { + mode = xd->mode_info_context->bmi[block].as_mode; + } else { + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; + } + xoff = 4 * (block & twmask); + yoff = 4 * (block >> twl); + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + src = p->src.buf + yoff * p->src.stride + xoff; + src_diff = p->src_diff + 4 * bw * yoff + xoff; + vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, + dst, pd->dst.stride, dst, pd->dst.stride); + vp9_subtract_block(4, 4, src_diff, bw * 4, + src, p->src.stride, dst, pd->dst.stride); + if (tx_type != DCT_DCT) + vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); else - vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + x->fwd_txm4x4(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); + if (!x->skip_encode && *eob) { + if (tx_type == DCT_DCT) + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
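/*
 * Editor's sketch (not part of the libvpx diff): the per-block residual that
 * feeds the forward transforms above.  A minimal stand-alone version of
 * "subtract the predictor from the source", assuming 8-bit samples and
 * arbitrary row strides; the function name is hypothetical, the parameter
 * order mirrors the vp9_subtract_block() calls quoted in this hunk.
 */
#include <stdint.h>

static void subtract_block_sketch(int rows, int cols,
                                  int16_t *diff, int diff_stride,
                                  const uint8_t *src, int src_stride,
                                  const uint8_t *pred, int pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c)
      diff[c] = (int16_t)src[c] - (int16_t)pred[c];  /* residual sample */
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}

/*
 * For a 4x4 luma block at (xoff, yoff) with the strides used in this file
 * (src_diff rows are bw * 4 samples wide), the call would look like:
 *   subtract_block_sketch(4, 4, src_diff, bw * 4,
 *                         src, p->src.stride, dst, pd->dst.stride);
 * The inverse transform below is then only applied when skip_encode is off
 * and the quantizer left a nonzero coefficient (*eob != 0).
 */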
+ inverse_transform_b_4x4_add(xd, *eob, dqcoeff, + dst, pd->dst.stride); + else + vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); + } break; + default: + assert(0); } } diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index 5796903..defaa48 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -27,6 +27,12 @@ struct optimize_ctx { ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; }; +struct encode_b_args { + VP9_COMMON *cm; + MACROBLOCK *x; + struct optimize_ctx *ctx; +}; + void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, struct optimize_ctx *ctx); void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -39,13 +45,11 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); +void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg); void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride); void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index a582d18..2f5e16c 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -128,111 +128,93 @@ static void build_nmv_component_cost_table(int *mvcost, } } -static int update_nmv_savings(const unsigned int ct[2], - const vp9_prob cur_p, - const vp9_prob new_p, - const vp9_prob upd_p) { - -#ifdef LOW_PRECISION_MV_UPDATE - vp9_prob mod_p = new_p | 1; -#else - vp9_prob mod_p = new_p; -#endif - const int cur_b = cost_branch256(ct, cur_p); - const int mod_b = cost_branch256(ct, mod_p); - const int cost = 7 * 256 + -#ifndef LOW_PRECISION_MV_UPDATE - 256 + -#endif - (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); - if (cur_b - mod_b - cost > 0) { - return cur_b - mod_b - cost; - } else { - return 0 - vp9_cost_zero(upd_p); - } -} - -static int update_nmv( - vp9_writer *const bc, - const unsigned int ct[2], - vp9_prob *const cur_p, - const vp9_prob new_p, - const vp9_prob upd_p) { - -#ifdef LOW_PRECISION_MV_UPDATE +static int update_mv(vp9_writer *w, const unsigned int ct[2], + vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) { vp9_prob mod_p = new_p | 1; -#else - vp9_prob mod_p = new_p; -#endif - const int cur_b = cost_branch256(ct, *cur_p); const int mod_b = cost_branch256(ct, mod_p); - const int cost = 7 * 256 + -#ifndef LOW_PRECISION_MV_UPDATE - 256 + -#endif - (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); - + const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); if (cur_b - mod_b > cost) { *cur_p = mod_p; - vp9_write(bc, 1, upd_p); -#ifdef LOW_PRECISION_MV_UPDATE - vp9_write_literal(bc, mod_p >> 1, 7); -#else - vp9_write_literal(bc, mod_p, 8); -#endif + vp9_write(w, 1, upd_p); + vp9_write_literal(w, mod_p >> 1, 7); return 1; } else { - vp9_write(bc, 0, upd_p); + vp9_write(w, 0, upd_p); return 0; } } -void print_nmvcounts(nmv_context_counts tnmvcounts) { +static void counts_to_nmv_context( + nmv_context_counts *nmv_count, + 
nmv_context *prob, + int usehp, + unsigned int (*branch_ct_joint)[2], + unsigned int (*branch_ct_sign)[2], + unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], + unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], + unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], + unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], + unsigned int (*branch_ct_fp)[4 - 1][2], + unsigned int (*branch_ct_class0_hp)[2], + unsigned int (*branch_ct_hp)[2]) { int i, j, k; - printf("\nCounts =\n { "); - for (j = 0; j < MV_JOINTS; ++j) - printf("%d, ", tnmvcounts.joints[j]); - printf("},\n"); + vp9_counts_process(nmv_count, usehp); + vp9_tree_probs_from_distribution(vp9_mv_joint_tree, + prob->joints, + branch_ct_joint, + nmv_count->joints, 0); for (i = 0; i < 2; ++i) { - printf(" {\n"); - printf(" %d/%d,\n", tnmvcounts.comps[i].sign[0], - tnmvcounts.comps[i].sign[1]); - printf(" { "); - for (j = 0; j < MV_CLASSES; ++j) - printf("%d, ", tnmvcounts.comps[i].classes[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d, ", tnmvcounts.comps[i].class0[j]); - printf("},\n"); - printf(" { "); - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d, ", tnmvcounts.comps[i].bits[j][0], - tnmvcounts.comps[i].bits[j][1]); - printf("},\n"); + const uint32_t s0 = nmv_count->comps[i].sign[0]; + const uint32_t s1 = nmv_count->comps[i].sign[1]; + + prob->comps[i].sign = get_binary_prob(s0, s1); + branch_ct_sign[i][0] = s0; + branch_ct_sign[i][1] = s1; + vp9_tree_probs_from_distribution(vp9_mv_class_tree, + prob->comps[i].classes, + branch_ct_classes[i], + nmv_count->comps[i].classes, 0); + vp9_tree_probs_from_distribution(vp9_mv_class0_tree, + prob->comps[i].class0, + branch_ct_class0[i], + nmv_count->comps[i].class0, 0); + for (j = 0; j < MV_OFFSET_BITS; ++j) { + const uint32_t b0 = nmv_count->comps[i].bits[j][0]; + const uint32_t b1 = nmv_count->comps[i].bits[j][1]; - printf(" {"); - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d, ", tnmvcounts.comps[i].class0_fp[j][k]); - printf("}, "); + prob->comps[i].bits[j] = get_binary_prob(b0, b1); + branch_ct_bits[i][j][0] = b0; + branch_ct_bits[i][j][1] = b1; + } + } + for (i = 0; i < 2; ++i) { + for (k = 0; k < CLASS0_SIZE; ++k) { + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, + prob->comps[i].class0_fp[k], + branch_ct_class0_fp[i][k], + nmv_count->comps[i].class0_fp[k], 0); + } + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, + prob->comps[i].fp, + branch_ct_fp[i], + nmv_count->comps[i].fp, 0); + } + if (usehp) { + for (i = 0; i < 2; ++i) { + const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0]; + const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1]; + const uint32_t hp0 = nmv_count->comps[i].hp[0]; + const uint32_t hp1 = nmv_count->comps[i].hp[1]; + + prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); + branch_ct_class0_hp[i][0] = c0_hp0; + branch_ct_class0_hp[i][1] = c0_hp1; + + prob->comps[i].hp = get_binary_prob(hp0, hp1); + branch_ct_hp[i][0] = hp0; + branch_ct_hp[i][1] = hp1; } - printf("},\n"); - - printf(" { "); - for (j = 0; j < 4; ++j) - printf("%d, ", tnmvcounts.comps[i].fp[j]); - printf("},\n"); - - printf(" %d/%d,\n", - tnmvcounts.comps[i].class0_hp[0], - tnmvcounts.comps[i].class0_hp[1]); - printf(" %d/%d,\n", - tnmvcounts.comps[i].hp[0], - tnmvcounts.comps[i].hp[1]); - printf(" },\n"); } } @@ -253,11 +235,11 @@ void print_nmvstats() { unsigned int branch_ct_class0_hp[2][2]; unsigned int branch_ct_hp[2][2]; int i, j, k; - 
vp9_counts_to_nmv_context(&tnmvcounts, &prob, 1, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); + counts_to_nmv_context(&tnmvcounts, &prob, 1, + branch_ct_joint, branch_ct_sign, branch_ct_classes, + branch_ct_class0, branch_ct_bits, + branch_ct_class0_fp, branch_ct_fp, + branch_ct_class0_hp, branch_ct_hp); printf("\nCounts =\n { "); for (j = 0; j < MV_JOINTS; ++j) @@ -394,154 +376,69 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { unsigned int branch_ct_fp[2][4 - 1][2]; unsigned int branch_ct_class0_hp[2][2]; unsigned int branch_ct_hp[2][2]; -#ifdef MV_GROUP_UPDATE - int savings = 0; -#endif + nmv_context *mvc = &cpi->common.fc.nmvc; #ifdef NMV_STATS if (!cpi->dummy_packing) add_nmvcount(&tnmvcounts, &cpi->NMVcount); #endif - vp9_counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); - /* write updates if they help */ -#ifdef MV_GROUP_UPDATE - for (j = 0; j < MV_JOINTS - 1; ++j) { - savings += update_nmv_savings(branch_ct_joint[j], - cpi->common.fc.nmvc.joints[j], - prob.joints[j], - VP9_NMV_UPDATE_PROB); - } - for (i = 0; i < 2; ++i) { - savings += update_nmv_savings(branch_ct_sign[i], - cpi->common.fc.nmvc.comps[i].sign, - prob.comps[i].sign, - VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_CLASSES - 1; ++j) { - savings += update_nmv_savings(branch_ct_classes[i][j], - cpi->common.fc.nmvc.comps[i].classes[j], - prob.comps[i].classes[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - savings += update_nmv_savings(branch_ct_class0[i][j], - cpi->common.fc.nmvc.comps[i].class0[j], - prob.comps[i].class0[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - savings += update_nmv_savings(branch_ct_bits[i][j], - cpi->common.fc.nmvc.comps[i].bits[j], - prob.comps[i].bits[j], - VP9_NMV_UPDATE_PROB); - } - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - int k; - for (k = 0; k < 3; ++k) { - savings += update_nmv_savings(branch_ct_class0_fp[i][j][k], - cpi->common.fc.nmvc.comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], - VP9_NMV_UPDATE_PROB); - } - } - for (j = 0; j < 3; ++j) { - savings += update_nmv_savings(branch_ct_fp[i][j], - cpi->common.fc.nmvc.comps[i].fp[j], - prob.comps[i].fp[j], - VP9_NMV_UPDATE_PROB); - } - } - if (usehp) { - for (i = 0; i < 2; ++i) { - savings += update_nmv_savings(branch_ct_class0_hp[i], - cpi->common.fc.nmvc.comps[i].class0_hp, - prob.comps[i].class0_hp, - VP9_NMV_UPDATE_PROB); - savings += update_nmv_savings(branch_ct_hp[i], - cpi->common.fc.nmvc.comps[i].hp, - prob.comps[i].hp, - VP9_NMV_UPDATE_PROB); - } - } - if (savings <= 0) { - vp9_write_bit(bc, 0); - return; - } - vp9_write_bit(bc, 1); -#endif + counts_to_nmv_context(&cpi->NMVcount, &prob, usehp, + branch_ct_joint, branch_ct_sign, branch_ct_classes, + branch_ct_class0, branch_ct_bits, + branch_ct_class0_fp, branch_ct_fp, + branch_ct_class0_hp, branch_ct_hp); + + for (j = 0; j < MV_JOINTS - 1; ++j) + update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j], + VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_JOINTS - 1; ++j) { - update_nmv(bc, branch_ct_joint[j], - &cpi->common.fc.nmvc.joints[j], - prob.joints[j], - VP9_NMV_UPDATE_PROB); - } for (i = 0; i < 2; ++i) { - update_nmv(bc, branch_ct_sign[i], - 
&cpi->common.fc.nmvc.comps[i].sign, - prob.comps[i].sign, - VP9_NMV_UPDATE_PROB); - for (j = 0; j < MV_CLASSES - 1; ++j) { - update_nmv(bc, branch_ct_classes[i][j], - &cpi->common.fc.nmvc.comps[i].classes[j], - prob.comps[i].classes[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - update_nmv(bc, branch_ct_class0[i][j], - &cpi->common.fc.nmvc.comps[i].class0[j], - prob.comps[i].class0[j], - VP9_NMV_UPDATE_PROB); - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - update_nmv(bc, branch_ct_bits[i][j], - &cpi->common.fc.nmvc.comps[i].bits[j], - prob.comps[i].bits[j], - VP9_NMV_UPDATE_PROB); - } + update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, + prob.comps[i].sign, VP9_NMV_UPDATE_PROB); + for (j = 0; j < MV_CLASSES - 1; ++j) + update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], + prob.comps[i].classes[j], VP9_NMV_UPDATE_PROB); + + for (j = 0; j < CLASS0_SIZE - 1; ++j) + update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], + prob.comps[i].class0[j], VP9_NMV_UPDATE_PROB); + + for (j = 0; j < MV_OFFSET_BITS; ++j) + update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], + prob.comps[i].bits[j], VP9_NMV_UPDATE_PROB); } + for (i = 0; i < 2; ++i) { for (j = 0; j < CLASS0_SIZE; ++j) { int k; - for (k = 0; k < 3; ++k) { - update_nmv(bc, branch_ct_class0_fp[i][j][k], - &cpi->common.fc.nmvc.comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], - VP9_NMV_UPDATE_PROB); - } - } - for (j = 0; j < 3; ++j) { - update_nmv(bc, branch_ct_fp[i][j], - &cpi->common.fc.nmvc.comps[i].fp[j], - prob.comps[i].fp[j], - VP9_NMV_UPDATE_PROB); + for (k = 0; k < 3; ++k) + update_mv(bc, branch_ct_class0_fp[i][j][k], + &mvc->comps[i].class0_fp[j][k], + prob.comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB); } + + for (j = 0; j < 3; ++j) + update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], + prob.comps[i].fp[j], VP9_NMV_UPDATE_PROB); } + if (usehp) { for (i = 0; i < 2; ++i) { - update_nmv(bc, branch_ct_class0_hp[i], - &cpi->common.fc.nmvc.comps[i].class0_hp, - prob.comps[i].class0_hp, - VP9_NMV_UPDATE_PROB); - update_nmv(bc, branch_ct_hp[i], - &cpi->common.fc.nmvc.comps[i].hp, - prob.comps[i].hp, - VP9_NMV_UPDATE_PROB); + update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, + prob.comps[i].class0_hp, VP9_NMV_UPDATE_PROB); + update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, + prob.comps[i].hp, VP9_NMV_UPDATE_PROB); } } } -void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, +void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, + const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp) { const MV diff = {mv->row - ref->row, mv->col - ref->col}; const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); - usehp = usehp && vp9_use_nmv_hp(ref); + usehp = usehp && vp9_use_mv_hp(ref); write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]); if (mv_joint_vertical(j)) @@ -549,6 +446,13 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. 
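/*
 * Editor's sketch (not from the diff): how the MV difference "diff" computed
 * in vp9_encode_mv above splits into a joint type, i.e. which of the two
 * components actually needs to be coded.  The four-way split and the
 * enumerator names are restated here only for illustration and are assumed,
 * not quoted from the library headers.
 */
typedef enum {
  JOINT_ZERO   = 0,   /* both row and col are zero  */
  JOINT_HNZVZ  = 1,   /* col nonzero, row zero      */
  JOINT_HZVNZ  = 2,   /* col zero, row nonzero      */
  JOINT_HNZVNZ = 3    /* both components nonzero    */
} mv_joint_sketch;

static mv_joint_sketch get_mv_joint_sketch(int row, int col) {
  if (row == 0)
    return col == 0 ? JOINT_ZERO : JOINT_HNZVZ;
  return col == 0 ? JOINT_HZVNZ : JOINT_HNZVNZ;
}

/* Only the nonzero component(s) are then passed to encode_mv_component(),
 * which is why the code above tests mv_joint_vertical() and
 * mv_joint_horizontal() before writing each component. */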
+ if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) { + unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3; + cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude); + } } void vp9_build_nmv_cost_table(int *mvjoint, @@ -567,44 +471,42 @@ void vp9_build_nmv_cost_table(int *mvjoint, void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv) { - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; - MV mv; - int bwl = b_width_log2(mbmi->sb_type), bw = 1 << bwl; - int bhl = b_height_log2(mbmi->sb_type), bh = 1 << bhl; + MODE_INFO *mi = x->e_mbd.mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; + MV diff; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; int idx, idy; if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - int i; PARTITION_INFO *pi = x->partition_info; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - i = idy * 2 + idx; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + const int i = idy * 2 + idx; if (pi->bmi[i].mode == NEWMV) { - mv.row = (pi->bmi[i].mv.as_mv.row - best_ref_mv->as_mv.row); - mv.col = (pi->bmi[i].mv.as_mv.col - best_ref_mv->as_mv.col); - vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row; + diff.col = mi->bmi[i].as_mv[0].as_mv.col - best_ref_mv->as_mv.col; + vp9_inc_mv(&diff, &cpi->NMVcount); + if (x->e_mbd.mode_info_context->mbmi.ref_frame[1] > INTRA_FRAME) { - mv.row = pi->bmi[i].second_mv.as_mv.row - + diff.row = mi->bmi[i].as_mv[1].as_mv.row - second_best_ref_mv->as_mv.row; - mv.col = pi->bmi[i].second_mv.as_mv.col - + diff.col = mi->bmi[i].as_mv[1].as_mv.col - second_best_ref_mv->as_mv.col; - vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + vp9_inc_mv(&diff, &cpi->NMVcount); } } } } } else if (mbmi->mode == NEWMV) { - mv.row = (mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row); - mv.col = (mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col); - vp9_increment_nmv(&mv, &best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + diff.row = mbmi->mv[0].as_mv.row - best_ref_mv->as_mv.row; + diff.col = mbmi->mv[0].as_mv.col - best_ref_mv->as_mv.col; + vp9_inc_mv(&diff, &cpi->NMVcount); + if (mbmi->ref_frame[1] > INTRA_FRAME) { - mv.row = (mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row); - mv.col = (mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col); - vp9_increment_nmv(&mv, &second_best_ref_mv->as_mv, &cpi->NMVcount, - x->e_mbd.allow_high_precision_mv); + diff.row = mbmi->mv[1].as_mv.row - second_best_ref_mv->as_mv.row; + diff.col = mbmi->mv[1].as_mv.col - second_best_ref_mv->as_mv.col; + vp9_inc_mv(&diff, &cpi->NMVcount); } } } diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h index cb25d85..2789ce1 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libvpx/vp9/encoder/vp9_encodemv.h @@ -16,7 +16,7 @@ void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const); -void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, +void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp); void vp9_build_nmv_cost_table(int *mvjoint, @@ -28,6 +28,4 @@ void vp9_build_nmv_cost_table(int *mvjoint, void vp9_update_nmv_count(VP9_COMP *cpi, 
MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv); -void print_nmvcounts(nmv_context_counts tnmvcounts); - #endif // VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index 5e26cd8..ec2e361 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -370,19 +370,6 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r } } -static enum BlockSize get_bs(BLOCK_SIZE_TYPE b) { - switch (b) { - case BLOCK_SIZE_SB8X8: - return BLOCK_8X8; - case BLOCK_SIZE_SB16X8: - return BLOCK_16X8; - case BLOCK_SIZE_SB8X16: - return BLOCK_8X16; - default: - return BLOCK_16X16; - } -} - static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int_mv *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, @@ -398,7 +385,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp9_variance_fn_ptr_t v_fn_ptr = - cpi->fn_ptr[get_bs(xd->mode_info_context->mbmi.sb_type)]; + cpi->fn_ptr[xd->mode_info_context->mbmi.sb_type]; int new_mv_mode_penalty = 256; int sr = 0; @@ -514,16 +501,14 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_clear_system_state(); // __asm emms; vp9_setup_src_planes(x, cpi->Source, 0, 0); - setup_pre_planes(xd, lst_yv12, NULL, 0, 0, NULL, NULL); + setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL); setup_dst_planes(xd, new_yv12, 0, 0); x->partition_info = x->pi; xd->mode_info_context = cm->mi; - vp9_build_block_offsets(x); - - vp9_setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); vp9_frame_init_quantizer(cpi); @@ -986,9 +971,11 @@ static int estimate_max_q(VP9_COMP *cpi, // Corrections for higher compression speed settings // (reduced compression expected) + // FIXME(jimbankoski): Once we settle on vp9 speed features we need to + // change this code. if (cpi->compressor_speed == 1) speed_correction = cpi->oxcf.cpu_used <= 5 ? - 1.04 + (cpi->oxcf.cpu_used * 0.04) : + 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) : 1.25; // Try and pick a max Q that will be high enough to encode the @@ -1051,7 +1038,7 @@ static int estimate_cq(VP9_COMP *cpi, // (reduced compression expected) if (cpi->compressor_speed == 1) { if (cpi->oxcf.cpu_used <= 5) - speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04); else speed_correction = 1.25; } @@ -1106,13 +1093,13 @@ static int estimate_cq(VP9_COMP *cpi, } -extern void vp9_new_frame_rate(VP9_COMP *cpi, double framerate); +extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; FIRSTPASS_STATS *start_pos; - double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate; + double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -1133,10 +1120,10 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // encoded in the second pass is a guess. However the sum duration is not. // Its calculated based on the actual durations of all frames from the first // pass. 
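/*
 * Editor's note (illustration only): the vp9_new_framerate() call below
 * derives the average frame rate from first-pass totals, with durations
 * accumulated in units of 1/10,000,000 second as in the surrounding code.
 * For example, 300 frames whose durations sum to 100,000,000 ticks give
 *   10000000.0 * 300 / 100000000.0 = 30.0 fps.
 * The helper name is hypothetical.
 */
static double framerate_from_totals_sketch(double frame_count,
                                           double total_duration_ticks) {
  return 10000000.0 * frame_count / total_duration_ticks;
}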
- vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration); - cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->output_framerate = cpi->oxcf.framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0); cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * @@ -2216,7 +2203,7 @@ void vp9_second_pass(VP9_COMP *cpi) { // Set nominal per second bandwidth for this frame cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth - * cpi->output_frame_rate); + * cpi->output_framerate); if (cpi->target_bandwidth < 0) cpi->target_bandwidth = 0; @@ -2636,7 +2623,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { cpi->per_frame_bandwidth = cpi->twopass.kf_bits; // Convert to a per second bitrate cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } // Note the total error score of the kf group minus the key frame itself diff --git a/libvpx/vp9/encoder/vp9_lookahead.c b/libvpx/vp9/encoder/vp9_lookahead.c index b07d92a..81445a9 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.c +++ b/libvpx/vp9/encoder/vp9_lookahead.c @@ -15,8 +15,6 @@ #include "vp9/encoder/vp9_lookahead.h" #include "vp9/common/vp9_extend.h" -#define MAX_LAG_BUFFERS 25 - struct lookahead_ctx { unsigned int max_sz; /* Absolute size of the queue */ unsigned int sz; /* Number of buffers currently in the queue */ diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h index 81baa2c..c773f8f 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.h +++ b/libvpx/vp9/encoder/vp9_lookahead.h @@ -14,6 +14,8 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#define MAX_LAG_BUFFERS 25 + struct lookahead_entry { YV12_BUFFER_CONFIG img; int64_t ts_start; diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 65fdcbe..7d6db07 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -15,6 +15,7 @@ #include <vp9/encoder/vp9_rdopt.h> #include <vp9/common/vp9_blockd.h> #include <vp9/common/vp9_reconinter.h> +#include <vp9/common/vp9_reconintra.h> #include <vp9/common/vp9_systemdependent.h> #include <vp9/encoder/vp9_segmentation.h> @@ -35,8 +36,9 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, int_mv ref_full; // Further step/diamond searches as necessary - int step_param = cpi->sf.first_step + + int step_param = cpi->sf.reduce_first_step_size + (cpi->speed < 8 ? (cpi->speed > 5 ? 
1 : 0) : 2); + step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); vp9_clamp_mv_min_max(x, ref_mv); @@ -145,16 +147,11 @@ static int find_best_16x16_intra(VP9_COMP *cpi, // we're intentionally not doing 4x4, we just want a rough estimate for (mode = DC_PRED; mode <= TM_PRED; mode++) { unsigned int err; - const int bwl = b_width_log2(BLOCK_SIZE_MB16X16), bw = 4 << bwl; - const int bhl = b_height_log2(BLOCK_SIZE_MB16X16), bh = 4 << bhl; xd->mode_info_context->mbmi.mode = mode; - vp9_build_intra_predictors(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - xd->mode_info_context->mbmi.mode, - bw, bh, - xd->up_available, xd->left_available, - xd->right_available); + vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode, + x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err); @@ -323,8 +320,9 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; - CHECK_MEM_ERROR(arf_not_zz, - vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); + CHECK_MEM_ERROR(cm, arf_not_zz, + vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), + 1)); // We are not interested in results beyond the alt ref itself. if (n_frames > cpi->frames_till_gf_update_due) @@ -408,8 +406,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { // being a GF - so exit if we don't look ahead beyond that if (n_frames <= cpi->frames_till_gf_update_due) return; - if (n_frames > (int)cpi->common.frames_till_alt_ref_frame) - n_frames = cpi->common.frames_till_alt_ref_frame; + if (n_frames > (int)cpi->frames_till_alt_ref_frame) + n_frames = cpi->frames_till_alt_ref_frame; if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS; diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 2e99736..0be9891 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -19,11 +19,13 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" +// #define NEW_DIAMOND_SEARCH + void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); + ((ref_mv->as_mv.col & 7) ? 1 : 0); int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 1 : 0); + ((ref_mv->as_mv.row & 7) ? 1 : 0); int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; @@ -38,16 +40,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { x->mv_row_max = row_max; } -int vp9_init_search_range(int width, int height) { +int vp9_init_search_range(VP9_COMP *cpi, int size) { int sr = 0; - int frm = MIN(width, height); - while ((frm << sr) < MAX_FULL_PEL_VAL) + // Minimum search size no matter what the passed in value. 
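/*
 * Editor's sketch (not from the diff): the search-range derivation carried
 * out by the statements that follow in vp9_init_search_range().  Starting
 * from the block size (floored at 16), sr counts how many doublings are
 * needed before the size covers the full-pel motion range.  MAX_FULL_PEL_VAL
 * comes from vp9_mcomp.h; its value is not restated here, so it is passed in
 * as a parameter, and the function name is hypothetical.
 */
static int init_search_range_sketch(int size, int max_full_pel_val) {
  int sr = 0;
  if (size < 16) size = 16;              /* minimum search size */
  while ((size << sr) < max_full_pel_val)
    ++sr;
  if (sr > 0)
    --sr;
  return sr;    /* the caller then adds the speed-feature offset and clamps */
}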
+ size = MAX(16, size); + + while ((size << sr) < MAX_FULL_PEL_VAL) sr++; if (sr) sr--; + sr += cpi->sf.reduce_first_step_size; + sr = MIN(sr, (cpi->sf.max_step_search_steps - 2)); return sr; } @@ -366,7 +372,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, } if (xd->allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -447,7 +453,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, int offset; int usehp = xd->allow_high_precision_mv; - uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + bestmv->as_mv.col; @@ -556,7 +562,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, } if (xd->allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -597,8 +603,6 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, bestmv->as_mv.row = br; bestmv->as_mv.col = bc; - vpx_free(comp_pred); - if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) return INT_MAX; @@ -930,7 +934,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, } if (x->e_mbd.allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -1509,12 +1513,13 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, this_row_offset = best_mv->as_mv.row + ss[i].mv.row; this_col_offset = best_mv->as_mv.col + ss[i].mv.col; - if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) - - { + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; @@ -1537,6 +1542,34 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, best_mv->as_mv.col += ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; + this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[best_site].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + }; +#endif } else if (best_address == in_what) (*num00)++; } @@ -1678,12 +1711,39 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, i++; } } - if (best_site != last_site) { best_mv->as_mv.row += ss[best_site].mv.row; best_mv->as_mv.col += 
ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; + this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[best_site].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + }; +#endif } else if (best_address == in_what) (*num00)++; } @@ -1704,6 +1764,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ + int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, int_mv *mvp_full, int step_param, int sadpb, int further_steps, @@ -2355,16 +2416,12 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - /* Compound pred buffer */ - uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; /* Get compound pred by averaging two pred blocks. */ - comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); - - bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) + + bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, + second_pred, 0x7fffffff) + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { @@ -2382,9 +2439,8 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, best_address; /* Get compound block and use it to calculate SAD. */ - comp_avg_pred(comp_pred, second_pred, w, h, check_here, - in_what_stride); - thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad); + thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride, + second_pred, bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; @@ -2414,16 +2470,15 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, this_mv.as_mv.col = ref_mv->as_mv.col << 3; if (bestsad < INT_MAX) { - int besterr; - comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); - besterr = fn_ptr->vf(what, what_stride, comp_pred, w, - (unsigned int *)(&thissad)) + + // FIXME(rbultje, yunqing): add full-pixel averaging variance functions + // so we don't have to use the subpixel with xoff=0,yoff=0 here. 
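/*
 * Editor's sketch (illustration only): what the sdaf() calls above compute --
 * a SAD taken against the average of two predictors -- without materialising
 * the averaged block in a temporary buffer the way the removed
 * comp_avg_pred() + sdf() pair did.  Plain C, 8-bit samples, a rounded
 * average is assumed, and the names are hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_avg_sketch(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   const uint8_t *second_pred,
                                   int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int avg = (ref[c] + second_pred[c] + 1) >> 1;  /* rounded avg */
      sad += abs(src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;   /* second predictor assumed packed width-wide */
  }
  return sad;
}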
+ int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0, + what, what_stride, (unsigned int *)(&thissad), + second_pred) + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, xd->allow_high_precision_mv); - vpx_free(comp_pred); return besterr; } else { - vpx_free(comp_pred); return INT_MAX; } } diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index 28b2efd..c13ea75 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -24,15 +24,15 @@ #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); -int vp9_init_search_range(int width, int height); - int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int weight, int ishp); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); -// Runs sequence of diamond searches in smaller steps for RD struct VP9_COMP; +int vp9_init_search_range(struct VP9_COMP *cpi, int size); + +// Runs sequence of diamond searches in smaller steps for RD int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, int_mv *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, diff --git a/libvpx/vp9/encoder/vp9_modecosts.c b/libvpx/vp9/encoder/vp9_modecosts.c index f2e4ce4..993aba7 100644 --- a/libvpx/vp9/encoder/vp9_modecosts.c +++ b/libvpx/vp9/encoder/vp9_modecosts.c @@ -22,8 +22,8 @@ void vp9_init_mode_costs(VP9_COMP *c) { for (i = 0; i < VP9_INTRA_MODES; i++) { for (j = 0; j < VP9_INTRA_MODES; j++) { - vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], - x->kf_y_mode_prob[i][j], KT); + vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], + KT); } } @@ -33,7 +33,8 @@ void vp9_init_mode_costs(VP9_COMP *c) { vp9_cost_tokens(c->mb.intra_uv_mode_cost[1], x->fc.uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree); vp9_cost_tokens(c->mb.intra_uv_mode_cost[0], - x->kf_uv_mode_prob[VP9_INTRA_MODES - 1], vp9_intra_mode_tree); + vp9_kf_uv_mode_prob[VP9_INTRA_MODES - 1], + vp9_intra_mode_tree); for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i) vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index 6a14df4..e5f1a5c 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -131,6 +131,32 @@ static int gf_low_motion_minq[QINDEX_RANGE]; static int gf_high_motion_minq[QINDEX_RANGE]; static int inter_minq[QINDEX_RANGE]; +static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + // Functions to compute the active minq lookup table entries based on a // formulaic approach to facilitate easier adjustment of the Q tables. 
// The formulae were derived from computing a 3rd order polynomial best @@ -217,22 +243,23 @@ void vp9_initialize_enc() { static void setup_features(VP9_COMP *cpi) { MACROBLOCKD *xd = &cpi->mb.e_mbd; + struct loopfilter *lf = &xd->lf; // Set up default state for MB feature flags - xd->segmentation_enabled = 0; + xd->seg.enabled = 0; - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; - vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs)); + xd->seg.update_map = 0; + xd->seg.update_data = 0; + vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs)); - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); - xd->mode_ref_lf_delta_enabled = 0; - xd->mode_ref_lf_delta_update = 0; - vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); - vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); - vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); - vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); + lf->mode_ref_delta_enabled = 0; + lf->mode_ref_delta_update = 0; + vp9_zero(lf->ref_deltas); + vp9_zero(lf->mode_deltas); + vp9_zero(lf->last_ref_deltas); + vp9_zero(lf->last_mode_deltas); set_default_lf_deltas(cpi); } @@ -305,26 +332,26 @@ static void configure_static_seg_features(VP9_COMP *cpi) { if (cm->frame_type == KEY_FRAME) { // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; cpi->static_mb_pct = 0; // Disable segmentation vp9_disable_segmentation((VP9_PTR)cpi); // Clear down the segment features. - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); } else if (cpi->refresh_alt_ref_frame) { // If this is an alt ref frame // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; cpi->static_mb_pct = 0; // Disable segmentation and individual segment features by default vp9_disable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); // Scan frames from current to arf frame. // This function re-enables segmentation if appropriate. @@ -332,45 +359,45 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // If segmentation was enabled set those features needed for the // arf itself. 
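/*
 * Editor's sketch (not from the diff): how a segment-level quantizer feature
 * such as the SEG_LVL_ALT_Q data set further below is typically applied.  In
 * SEGMENT_DELTADATA mode the segment value is an offset from the base Q
 * index; in absolute mode it replaces it.  The 0..255 clamp reflects the VP9
 * Q-index range; the function and parameter names are hypothetical.
 */
static int apply_seg_qindex_sketch(int base_qindex, int seg_data,
                                   int use_absolute) {
  const int q = use_absolute ? seg_data : base_qindex + seg_data;
  if (q < 0) return 0;
  if (q > 255) return 255;
  return q;
}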
- if (xd->segmentation_enabled) { - xd->update_mb_segmentation_map = 1; - xd->update_mb_segmentation_data = 1; + if (xd->seg.enabled) { + xd->seg.update_map = 1; + xd->seg.update_data = 1; qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF); // Where relevant assume segment data is delta data - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + xd->seg.abs_delta = SEGMENT_DELTADATA; } - } else if (xd->segmentation_enabled) { + } else if (xd->seg.enabled) { // All other frames if segmentation has been enabled // First normal frame in a valid gf or alt ref group - if (cpi->common.frames_since_golden == 0) { + if (cpi->frames_since_golden == 0) { // Set up segment features for normal frames in an arf group if (cpi->source_alt_ref_active) { - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 1; - xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + xd->seg.update_map = 0; + xd->seg.update_data = 1; + xd->seg.abs_delta = SEGMENT_DELTADATA; qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 1.125)); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_Q); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q); - vp9_set_segdata(xd, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(xd, 1, SEG_LVL_ALT_LF); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF); // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { - vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP); } } else { // Disable segmentation and clear down features if alt ref @@ -380,10 +407,10 @@ static void configure_static_seg_features(VP9_COMP *cpi) { vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; - vp9_clearall_segfeatures(xd); + vp9_clearall_segfeatures(&xd->seg); } } else if (cpi->is_src_frame_alt_ref) { // Special case where we are coding over the top of a previous @@ -391,28 +418,28 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // Segment coding disabled for compred testing // Enable ref frame features for segment 0 as well - vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME); // All mbs should use ALTREF_FRAME - vp9_clear_segdata(xd, 0, SEG_LVL_REF_FRAME); - vp9_set_segdata(xd, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_clear_segdata(xd, 1, SEG_LVL_REF_FRAME); - vp9_set_segdata(xd, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(&xd->seg, 0, SEG_LVL_REF_FRAME); + vp9_set_segdata(&xd->seg, 0, 
SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME); + vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); // Skip all MBs if high Q (0,0 mv and skip coeffs) if (high_q) { - vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP); - vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP); + vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP); } - // Enable data udpate - xd->update_mb_segmentation_data = 1; + // Enable data update + xd->seg.update_data = 1; } else { // All other frames. // No updates.. leave things as they are. - xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; } } } @@ -518,20 +545,22 @@ static void update_reference_segmentation_map(VP9_COMP *cpi) { } static void set_default_lf_deltas(VP9_COMP *cpi) { - cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; - cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; + struct loopfilter *lf = &cpi->mb.e_mbd.lf; + + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; - vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas)); - vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas)); + vp9_zero(lf->ref_deltas); + vp9_zero(lf->mode_deltas); // Test of ref frame deltas - cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2; - cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0; - cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2; - cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2; + lf->ref_deltas[INTRA_FRAME] = 2; + lf->ref_deltas[LAST_FRAME] = 0; + lf->ref_deltas[GOLDEN_FRAME] = -2; + lf->ref_deltas[ALTREF_FRAME] = -2; - cpi->mb.e_mbd.mode_lf_deltas[0] = 0; // Zero - cpi->mb.e_mbd.mode_lf_deltas[1] = 0; // New mv + lf->mode_deltas[0] = 0; // Zero + lf->mode_deltas[1] = 0; // New mv } static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { @@ -543,70 +572,70 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { for (i = 0; i < MAX_MODES; ++i) sf->thresh_mult[i] = mode == 0 ? 
-500 : 0; - sf->thresh_mult[THR_ZEROMV ] = 0; - sf->thresh_mult[THR_ZEROG ] = 0; - sf->thresh_mult[THR_ZEROA ] = 0; - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - - sf->thresh_mult[THR_NEARMV ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARG ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEARA ] += speed_multiplier * 1000; - - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] += speed_multiplier * 1000; - sf->thresh_mult[THR_V_PRED ] += speed_multiplier * 1000; - sf->thresh_mult[THR_H_PRED ] += speed_multiplier * 1000; - sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500; - sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500; - sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500; - sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500; - sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500; - sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500; - - sf->thresh_mult[THR_B_PRED ] += speed_multiplier * 2500; - - sf->thresh_mult[THR_NEWMV ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEWG ] += speed_multiplier * 1000; - sf->thresh_mult[THR_NEWA ] += speed_multiplier * 1000; - - sf->thresh_mult[THR_SPLITMV ] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITG ] += speed_multiplier * 2500; - sf->thresh_mult[THR_SPLITA ] += speed_multiplier * 2500; - - sf->thresh_mult[THR_COMP_ZEROLA ] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_ZEROGA ] += speed_multiplier * 1500; - - sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500; - - sf->thresh_mult[THR_COMP_NEARLA ] += speed_multiplier * 1500; - sf->thresh_mult[THR_COMP_NEARGA ] += speed_multiplier * 1500; - - sf->thresh_mult[THR_COMP_NEWLA ] += speed_multiplier * 2000; - sf->thresh_mult[THR_COMP_NEWGA ] += speed_multiplier * 2000; - - sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500; - sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500; - - if (speed > 4) { + sf->thresh_mult[THR_NEARESTG] = 0; + sf->thresh_mult[THR_NEARESTA] = 0; + + sf->thresh_mult[THR_NEWMV] += speed_multiplier * 1000; + sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARMV] += speed_multiplier * 1000; + sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1000; + + sf->thresh_mult[THR_DC] += speed_multiplier * 1000; + + sf->thresh_mult[THR_NEWG] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEWA] += speed_multiplier * 1000; + sf->thresh_mult[THR_NEARA] += speed_multiplier * 1000; + + sf->thresh_mult[THR_TM] += speed_multiplier * 1000; + + sf->thresh_mult[THR_COMP_NEARLA] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEWLA] += speed_multiplier * 2000; + sf->thresh_mult[THR_NEARG] += speed_multiplier * 1000; + sf->thresh_mult[THR_COMP_NEARGA] += speed_multiplier * 1500; + sf->thresh_mult[THR_COMP_NEWGA] += speed_multiplier * 2000; + + sf->thresh_mult[THR_SPLITMV] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITG] += speed_multiplier * 2500; + sf->thresh_mult[THR_SPLITA] += speed_multiplier * 2500; + sf->thresh_mult[THR_COMP_SPLITLA] += speed_multiplier * 4500; + sf->thresh_mult[THR_COMP_SPLITGA] += speed_multiplier * 4500; + + sf->thresh_mult[THR_ZEROMV] += speed_multiplier * 2000; + sf->thresh_mult[THR_ZEROG] += speed_multiplier * 2000; + sf->thresh_mult[THR_ZEROA] += speed_multiplier * 2000; + sf->thresh_mult[THR_COMP_ZEROLA] += speed_multiplier * 2500; + 
sf->thresh_mult[THR_COMP_ZEROGA] += speed_multiplier * 2500; + + sf->thresh_mult[THR_B_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_H_PRED] += speed_multiplier * 2000; + sf->thresh_mult[THR_V_PRED] += speed_multiplier * 2000; + sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 2500; + sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D27_PRED] += speed_multiplier * 2500; + sf->thresh_mult[THR_D63_PRED] += speed_multiplier * 2500; + + if (cpi->sf.skip_lots_of_modes) { for (i = 0; i < MAX_MODES; ++i) sf->thresh_mult[i] = INT_MAX; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 0; - sf->thresh_mult[THR_NEWMV ] = 4000; - sf->thresh_mult[THR_NEWG ] = 4000; - sf->thresh_mult[THR_NEWA ] = 4000; + sf->thresh_mult[THR_DC] = 2000; + sf->thresh_mult[THR_TM] = 2000; + sf->thresh_mult[THR_NEWMV] = 4000; + sf->thresh_mult[THR_NEWG] = 4000; + sf->thresh_mult[THR_NEWA] = 4000; sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEARESTG] = 0; + sf->thresh_mult[THR_NEARESTA] = 0; + sf->thresh_mult[THR_NEARMV] = 2000; + sf->thresh_mult[THR_NEARG] = 2000; + sf->thresh_mult[THR_NEARA] = 2000; sf->thresh_mult[THR_COMP_NEARESTLA] = 2000; + sf->thresh_mult[THR_SPLITMV] = 2500; + sf->thresh_mult[THR_SPLITG] = 2500; + sf->thresh_mult[THR_SPLITA] = 2500; sf->recode_loop = 0; } @@ -649,6 +678,15 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; } + + if (sf->disable_splitmv == 1) { + sf->thresh_mult[THR_SPLITMV ] = INT_MAX; + sf->thresh_mult[THR_SPLITG ] = INT_MAX; + sf->thresh_mult[THR_SPLITA ] = INT_MAX; + + sf->thresh_mult[THR_COMP_SPLITLA ] = INT_MAX; + sf->thresh_mult[THR_COMP_SPLITGA ] = INT_MAX; + } } void vp9_set_speed_features(VP9_COMP *cpi) { @@ -677,10 +715,38 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->half_pixel_search = 1; sf->iterative_sub_pixel = 1; sf->optimize_coefficients = !cpi->oxcf.lossless; - sf->first_step = 0; + sf->reduce_first_step_size = 0; + sf->auto_mv_step_size = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4; - sf->adpative_rd_thresh = 0; + sf->adaptive_rd_thresh = 0; + sf->use_lastframe_partitioning = 0; + sf->tx_size_search_method = USE_FULL_RD; + sf->use_8tap_always = 0; + sf->use_avoid_tested_higherror = 0; + sf->reference_masking = 0; + sf->skip_lots_of_modes = 0; + sf->adjust_thresholds_by_speed = 0; + sf->partition_by_variance = 0; + sf->use_one_partition_size_always = 0; + sf->less_rectangular_check = 0; + sf->use_square_partition_only = 0; + sf->use_partitions_less_than = 0; + sf->less_than_block_size = BLOCK_SIZE_MB16X16; + sf->use_partitions_greater_than = 0; + sf->greater_than_block_size = BLOCK_SIZE_SB8X8; + sf->adjust_partitioning_from_last_frame = 0; + sf->last_partitioning_redo_frequency = 4; + sf->disable_splitmv = 0; + sf->mode_search_skip_flags = 0; + sf->last_chroma_intra_mode = TM_PRED; + sf->use_rd_breakout = 0; + sf->skip_encode_sb = 0; + sf->use_uv_intra_rd_estimate = 0; + sf->using_small_partition_info = 0; + // Skip any mode not chosen at size < X for all sizes > X + // Hence BLOCK_SIZE_SB64X64 (skip is off) + 
sf->unused_mode_skip_lvl = BLOCK_SIZE_SB64X64; #if CONFIG_MULTIPLE_ARF // Switch segmentation off. @@ -701,19 +767,121 @@ void vp9_set_speed_features(VP9_COMP *cpi) { #else sf->static_segmentation = 0; #endif - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; - sf->adpative_rd_thresh = 1; - if (speed > 0) { + sf->use_avoid_tested_higherror = 1; + sf->adaptive_rd_thresh = 1; + sf->last_chroma_intra_mode = TM_PRED; + + if (speed == 1) { + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->less_rectangular_check = 1; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); + sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0); + sf->disable_splitmv = + (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; + sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA; + sf->last_chroma_intra_mode = H_PRED; + sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; + sf->auto_mv_step_size = 1; + } + if (speed == 2) { + sf->adjust_thresholds_by_speed = 1; + sf->less_rectangular_check = 1; + sf->use_square_partition_only = 1; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->use_lastframe_partitioning = 1; + sf->adjust_partitioning_from_last_frame = 1; + sf->last_partitioning_redo_frequency = 3; + sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_COMP_REFMISMATCH; + sf->last_chroma_intra_mode = DC_PRED; + sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; + sf->use_uv_intra_rd_estimate = 1; + sf->using_small_partition_info = 1; + sf->disable_splitmv = + (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; + sf->auto_mv_step_size = 1; + } + if (speed == 3) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->partition_by_variance = 1; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? + USE_FULL_RD : + USE_LARGESTALL); + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_COMP_REFMISMATCH; + sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; + sf->disable_splitmv = 1; + sf->auto_mv_step_size = 1; + } + if (speed == 4) { + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; + sf->use_one_partition_size_always = 1; + sf->always_this_block_size = BLOCK_SIZE_MB16X16; + sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || + cpi->common.intra_only || + cpi->common.show_frame == 0) ? 
+ USE_FULL_RD : + USE_LARGESTALL); + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_COMP_REFMISMATCH; + sf->use_rd_breakout = 1; sf->optimize_coefficients = 0; - sf->first_step = 1; + sf->auto_mv_step_size = 1; + // sf->reduce_first_step_size = 1; + // sf->reference_masking = 1; + + sf->disable_splitmv = 1; + } + /* + if (speed == 2) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_partitions_less_than = 1; + sf->less_than_block_size = BLOCK_SIZE_MB16X16; + } + if (speed == 3) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_partitions_greater_than = 1; + sf->greater_than_block_size = BLOCK_SIZE_SB8X8; } + */ + break; }; /* switch */ // Set rd thresholds based on mode and speed setting - set_rd_speed_thresholds(cpi, mode, speed); + if (cpi->sf.adjust_thresholds_by_speed) + set_rd_speed_thresholds(cpi, mode, speed); + else + set_rd_speed_thresholds(cpi, mode, 0); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. @@ -732,8 +900,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; - vp9_init_quantizer(cpi); - if (cpi->sf.iterative_sub_pixel == 1) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_step_iteratively; } else if (cpi->sf.quarter_pixel_search) { @@ -770,8 +936,8 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { static int alloc_partition_data(VP9_COMP *cpi) { vpx_free(cpi->mb.pip); - cpi->mb.pip = vpx_calloc((cpi->common.mode_info_stride) * - (cpi->common.mi_rows + 64 / MI_SIZE), + cpi->mb.pip = vpx_calloc(cpi->common.mode_info_stride * + (cpi->common.mi_rows + MI_BLOCK_SIZE), sizeof(PARTITION_INFO)); if (!cpi->mb.pip) return 1; @@ -811,7 +977,7 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); } // Data used for real time vc mode to see if gf needs refreshing @@ -820,12 +986,12 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { cpi->gf_update_recommended = 0; vpx_free(cpi->mb_activity_map); - CHECK_MEM_ERROR(cpi->mb_activity_map, + CHECK_MEM_ERROR(cm, cpi->mb_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); vpx_free(cpi->mb_norm_activity_map); - CHECK_MEM_ERROR(cpi->mb_norm_activity_map, + CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, vpx_calloc(sizeof(unsigned int), cm->mb_rows * cm->mb_cols)); } @@ -889,14 +1055,14 @@ int vp9_reverse_trans(int x) { return 63; }; -void vp9_new_frame_rate(VP9_COMP *cpi, double framerate) { +void vp9_new_framerate(VP9_COMP *cpi, double framerate) { if (framerate < 0.1) framerate = 30; - cpi->oxcf.frame_rate = framerate; - cpi->output_frame_rate = cpi->oxcf.frame_rate; - cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); - cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); + cpi->oxcf.framerate = framerate; + cpi->output_framerate = cpi->oxcf.framerate; + cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); + cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate); cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); @@ -931,19 +1097,13 @@ static 
int64_t rescale(int val, int64_t num, int denom) { static void set_tile_limits(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int min_log2_tiles, max_log2_tiles; - cm->log2_tile_columns = cpi->oxcf.tile_columns; - cm->log2_tile_rows = cpi->oxcf.tile_rows; + int min_log2_tile_cols, max_log2_tile_cols; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles); - max_log2_tiles += min_log2_tiles; - - cm->log2_tile_columns = clamp(cm->log2_tile_columns, - min_log2_tiles, max_log2_tiles); - - cm->tile_columns = 1 << cm->log2_tile_columns; - cm->tile_rows = 1 << cm->log2_tile_rows; + cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, + min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; } static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { @@ -1059,7 +1219,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { { int i; - for (i = 0; i < MAX_MB_SEGMENTS; i++) + for (i = 0; i < MAX_SEGMENTS; i++) cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1093,7 +1253,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.target_bandwidth, 1000); // Set up frame rate and related parameters rate control values. - vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate); + vp9_new_framerate(cpi, cpi->oxcf.framerate); // Set absolute upper and lower quality limits cpi->worst_quality = cpi->oxcf.worst_allowed_q; @@ -1122,7 +1282,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness); - cm->sharpness_level = cpi->oxcf.Sharpness; + cpi->mb.e_mbd.lf.sharpness_level = cpi->oxcf.Sharpness; if (cpi->initial_width) { // Increasing the size of the frame beyond the first seen frame, or some @@ -1233,15 +1393,16 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { return 0; } - cpi->common.error.setjmp = 1; + cm->error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site), + (MAX_MVSEARCH_STEPS * 8) + 1)); - vp9_create_common(&cpi->common); + vp9_create_common(cm); init_config((VP9_PTR)cpi, oxcf); - cpi->common.current_video_frame = 0; + cm->current_video_frame = 0; cpi->kf_overspend_bits = 0; cpi->kf_bitrate_adjustment = 0; cpi->frames_till_gf_update_due = 0; @@ -1249,7 +1410,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->non_gf_bitrate_adjustment = 0; // Set reference frame sign bias for ALTREF frame to 1 (for now) - cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1; + cm->ref_frame_sign_bias[ALTREF_FRAME] = 1; cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; @@ -1258,28 +1419,27 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->gold_is_alt = 0; // Create the encoder segmentation map and set all entries to 0 - CHECK_MEM_ERROR(cpi->segmentation_map, - vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1)); + CHECK_MEM_ERROR(cm, cpi->segmentation_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a copy in common for temporal coding - CHECK_MEM_ERROR(cm->last_frame_seg_map, - vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1)); + CHECK_MEM_ERROR(cm, cm->last_frame_seg_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it - CHECK_MEM_ERROR(cpi->coding_context.last_frame_seg_map_copy, - 
vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, 1)); + CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); - vpx_memset(cpi->active_map, 1, (cpi->common.mb_rows * cpi->common.mb_cols)); + CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1)); + vpx_memset(cpi->active_map, 1, cm->MBs); cpi->active_map_enabled = 0; for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { - CHECK_MEM_ERROR(cpi->mbgraph_stats[i].mb_stats, - vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols * - sizeof(*cpi->mbgraph_stats[i].mb_stats), - 1)); + CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats, + vpx_calloc(cm->MBs * + sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } #ifdef ENTROPY_STATS @@ -1385,7 +1545,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); for (i = 0; i < KEY_FRAME_CONTEXT; i++) - cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate; + cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate; #ifdef OUTPUT_YUV_SRC yuv_file = fopen("bd.yuv", "ab"); @@ -1420,8 +1580,10 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { for (i = 0; i < MAX_MODES; i++) cpi->rd_thresh_mult[i] = 128; -#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ + SDX3F, SDX8F, SDX4DF)\ cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ cpi->fn_ptr[BT].svaf = SVAF; \ @@ -1432,67 +1594,80 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->fn_ptr[BT].sdx8f = SDX8F; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; - BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16, + BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg, + vp9_variance32x16, vp9_sub_pixel_variance32x16, vp9_sub_pixel_avg_variance32x16, NULL, NULL, NULL, NULL, NULL, vp9_sad32x16x4d) - BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32, + BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg, + vp9_variance16x32, vp9_sub_pixel_variance16x32, vp9_sub_pixel_avg_variance16x32, NULL, NULL, NULL, NULL, NULL, vp9_sad16x32x4d) - BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32, + BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg, + vp9_variance64x32, vp9_sub_pixel_variance64x32, vp9_sub_pixel_avg_variance64x32, NULL, NULL, NULL, NULL, NULL, vp9_sad64x32x4d) - BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64, + BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg, + vp9_variance32x64, vp9_sub_pixel_variance32x64, vp9_sub_pixel_avg_variance32x64, NULL, NULL, NULL, NULL, NULL, vp9_sad32x64x4d) - BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32, + BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg, + vp9_variance32x32, vp9_sub_pixel_variance32x32, vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v, vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8, vp9_sad32x32x4d) - BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64, + BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg, + vp9_variance64x64, vp9_sub_pixel_variance64x64, vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v, vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, 
vp9_sad64x64x8, vp9_sad64x64x4d) - BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16, + BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, + vp9_variance16x16, vp9_sub_pixel_variance16x16, vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v, vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, vp9_sad16x16x4d) - BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8, + BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg, + vp9_variance16x8, vp9_sub_pixel_variance16x8, vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) - BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16, + BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg, + vp9_variance8x16, vp9_sub_pixel_variance8x16, vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) - BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8, + BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg, + vp9_variance8x8, vp9_sub_pixel_variance8x8, vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) - BFP(BLOCK_8X4, vp9_sad8x4, vp9_variance8x4, vp9_sub_pixel_variance8x4, + BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg, + vp9_variance8x4, vp9_sub_pixel_variance8x4, vp9_sub_pixel_avg_variance8x4, NULL, NULL, NULL, NULL, vp9_sad8x4x8, vp9_sad8x4x4d) - BFP(BLOCK_4X8, vp9_sad4x8, vp9_variance4x8, vp9_sub_pixel_variance4x8, + BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg, + vp9_variance4x8, vp9_sub_pixel_variance4x8, vp9_sub_pixel_avg_variance4x8, NULL, NULL, NULL, NULL, vp9_sad4x8x8, vp9_sad4x8x4d) - BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, + BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg, + vp9_variance4x4, vp9_sub_pixel_variance4x4, vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) @@ -1510,7 +1685,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { */ vp9_init_quantizer(cpi); - vp9_loop_filter_init(cm); + vp9_loop_filter_init(cm, &cpi->mb.e_mbd.lf); cpi->common.error.setjmp = 0; @@ -1756,8 +1931,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) { struct vpx_codec_cx_pkt pkt; uint64_t sse; int i; - unsigned int width = cpi->common.width; - unsigned int height = cpi->common.height; + unsigned int width = orig->y_crop_width; + unsigned int height = orig->y_crop_height; pkt.kind = VPX_CODEC_PSNR_PKT; sse = calc_plane_error(orig->y_buffer, orig->y_stride, @@ -1768,8 +1943,8 @@ static void generate_psnr_packet(VP9_COMP *cpi) { pkt.data.psnr.samples[0] = width * height; pkt.data.psnr.samples[1] = width * height; - width = orig->uv_width; - height = orig->uv_height; + width = orig->uv_crop_width; + height = orig->uv_crop_height; sse = calc_plane_error(orig->u_buffer, orig->uv_stride, recon->u_buffer, recon->uv_stride, @@ -1997,7 +2172,7 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, static void update_alt_ref_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; #if CONFIG_MULTIPLE_ARF if (!cpi->multi_arf_enabled) @@ -2013,7 +2188,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (cpi->refresh_golden_frame) { // this frame refreshes means next frames don't unless specified by user cpi->refresh_golden_frame = 0; - cpi->common.frames_since_golden = 0; + cpi->frames_since_golden = 0; // ******** Fixed Q test code only 
************ // If we are going to use the ALT reference for the next group of frames set a flag to say so. @@ -2035,10 +2210,10 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { if (cpi->frames_till_gf_update_due > 0) cpi->frames_till_gf_update_due--; - if (cpi->common.frames_till_alt_ref_frame) - cpi->common.frames_till_alt_ref_frame--; + if (cpi->frames_till_alt_ref_frame) + cpi->frames_till_alt_ref_frame--; - cpi->common.frames_since_golden++; + cpi->frames_since_golden++; } } @@ -2230,8 +2405,9 @@ static void update_reference_frames(VP9_COMP * const cpi) { } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { - if (cpi->mb.e_mbd.lossless) { - cm->filter_level = 0; + MACROBLOCKD *xd = &cpi->mb.e_mbd; + if (xd->lossless) { + xd->lf.filter_level = 0; } else { struct vpx_usec_timer timer; @@ -2245,53 +2421,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); } - if (cm->filter_level > 0) { - vp9_set_alt_lf_level(cpi, cm->filter_level); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0); + if (xd->lf.filter_level > 0) { + vp9_set_alt_lf_level(cpi, xd->lf.filter_level); + vp9_loop_filter_frame(cm, xd, xd->lf.filter_level, 0); } - vp9_extend_frame_borders(cm->frame_to_show, - cm->subsampling_x, cm->subsampling_y); - -} - -void vp9_select_interp_filter_type(VP9_COMP *cpi) { - int i; - int high_filter_index = 0; - unsigned int thresh; - unsigned int high_count = 0; - unsigned int count_sum = 0; - unsigned int *hist = cpi->best_switchable_interp_count; - - if (DEFAULT_INTERP_FILTER != SWITCHABLE) { - cpi->common.mcomp_filter_type = DEFAULT_INTERP_FILTER; - return; - } - - // TODO(agrange): Look at using RD criteria to select the interpolation - // filter to use for the next frame rather than this simpler counting scheme. - - // Select the interpolation filter mode for the next frame - // based on the selection frequency seen in the current frame. - for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { - unsigned int count = hist[i]; - count_sum += count; - if (count > high_count) { - high_count = count; - high_filter_index = i; - } - } - - thresh = (unsigned int)(0.80 * count_sum); - - if (high_count > thresh) { - // One filter accounts for 80+% of cases so force the next - // frame to use this filter exclusively using frame-level flag. - cpi->common.mcomp_filter_type = vp9_switchable_interp[high_filter_index]; - } else { - // Use a MB-level switchable filter selection strategy. 
- cpi->common.mcomp_filter_type = SWITCHABLE; - } + vp9_extend_frame_inner_borders(cm->frame_to_show, + cm->subsampling_x, cm->subsampling_y); } static void scale_references(VP9_COMP *cpi) { @@ -2326,6 +2462,31 @@ static void release_scaled_references(VP9_COMP *cpi) { cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--; } +static void full_to_model_count(unsigned int *model_count, + unsigned int *full_count) { + int n; + model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; + model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; + model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; + for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n) + model_count[TWO_TOKEN] += full_count[n]; + model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN]; +} + +static void full_to_model_counts( + vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) { + int i, j, k, l; + for (i = 0; i < BLOCK_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + if (l >= 3 && k == 0) + continue; + full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]); + } +} + + static void encode_frame_to_data_rate(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, @@ -2351,6 +2512,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int undershoot_seen = 0; SPEED_FEATURES *sf = &cpi->sf; + unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); #if RESET_FOREACH_FILTER int q_low0; int q_high0; @@ -2392,7 +2554,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->per_frame_bandwidth = cpi->twopass.gf_bits; // per second target bitrate cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * - cpi->output_frame_rate); + cpi->output_framerate); } // Clear zbin over-quant value and mode boost values. @@ -2421,8 +2583,26 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set default state for segment based loop filter update flags - xd->mode_ref_lf_delta_update = 0; - + xd->lf.mode_ref_delta_update = 0; + + // Initialize cpi->mv_step_param to default based on max resolution + cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); + // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. 
+ if (sf->auto_mv_step_size) { + if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) { + // initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) + // allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution + cpi->mv_step_param = vp9_init_search_range( + cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->max_mv_magnitude = 0; + } + } // Set various flags etc to special state if it is a key frame if (cm->frame_type == KEY_FRAME) { @@ -2432,9 +2612,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, setup_features(cpi); // If segmentation is enabled force a map update for key frames - if (xd->segmentation_enabled) { - xd->update_mb_segmentation_map = 1; - xd->update_mb_segmentation_data = 1; + if (xd->seg.enabled) { + xd->seg.update_map = 1; + xd->seg.update_data = 1; } // The alternate reference frame cannot be active for a key frame @@ -2965,35 +3145,36 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->dummy_packing = 0; vp9_pack_bitstream(cpi, dest, size); - if (xd->update_mb_segmentation_map) { + if (xd->seg.update_map) update_reference_segmentation_map(cpi); - } release_scaled_references(cpi); update_reference_frames(cpi); for (t = TX_4X4; t <= TX_32X32; t++) - vp9_full_to_model_counts(cpi->common.fc.coef_counts[t], - cpi->coef_counts[t]); + full_to_model_counts(cpi->common.counts.coef[t], + cpi->coef_counts[t]); if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_coef_probs(&cpi->common); } if (cpi->common.frame_type != KEY_FRAME) { - vp9_copy(cpi->common.fc.y_mode_counts, cpi->y_mode_count); - vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count); - vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count); - vp9_copy(cm->fc.intra_inter_count, cpi->intra_inter_count); - vp9_copy(cm->fc.comp_inter_count, cpi->comp_inter_count); - vp9_copy(cm->fc.single_ref_count, cpi->single_ref_count); - vp9_copy(cm->fc.comp_ref_count, cpi->comp_ref_count); - cpi->common.fc.NMVcount = cpi->NMVcount; + FRAME_COUNTS *counts = &cpi->common.counts; + + vp9_copy(counts->y_mode, cpi->y_mode_count); + vp9_copy(counts->uv_mode, cpi->y_uv_mode_count); + vp9_copy(counts->partition, cpi->partition_count); + vp9_copy(counts->intra_inter, cpi->intra_inter_count); + vp9_copy(counts->comp_inter, cpi->comp_inter_count); + vp9_copy(counts->single_ref, cpi->single_ref_count); + vp9_copy(counts->comp_ref, cpi->comp_ref_count); + counts->mv = cpi->NMVcount; if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_mode_probs(&cpi->common); vp9_adapt_mode_context(&cpi->common); - vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); } } @@ -3273,23 +3454,32 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas. 
- xd->update_mb_segmentation_map = 0; - xd->update_mb_segmentation_data = 0; - xd->mode_ref_lf_delta_update = 0; + xd->seg.update_map = 0; + xd->seg.update_data = 0; + xd->lf.mode_ref_delta_update = 0; // keep track of the last coded dimensions cm->last_width = cm->width; cm->last_height = cm->height; - // Don't increment frame counters if this was an altref buffer - // update not a real frame + // reset to normal state now that we are done. cm->last_show_frame = cm->show_frame; if (cm->show_frame) { + // current mip will be the prev_mip for the next frame + MODE_INFO *temp = cm->prev_mip; + cm->prev_mip = cm->mip; + cm->mip = temp; + + // update the upper left visible macroblock ptrs + cm->mi = cm->mip + cm->mode_info_stride + 1; + + // Don't increment frame counters if this was an altref buffer + // update not a real frame ++cm->current_video_frame; ++cpi->frames_since_key; } - - // reset to normal state now that we are done. + // restore prev_mi + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; #if 0 { @@ -3307,17 +3497,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_rec_frame(cm); #endif - if (cm->show_frame) { - vpx_memcpy(cm->prev_mip, cm->mip, - cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) * - sizeof(MODE_INFO)); - } else { - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 64 / MI_SIZE) * - sizeof(MODE_INFO)); - } - // restore prev_mi - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; } static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, @@ -3327,7 +3506,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, vp9_second_pass(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); - + // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt"); #ifdef DISABLE_RC_LONG_TERM_MEM cpi->twopass.bits_left -= cpi->this_frame_target; #else @@ -3335,14 +3514,14 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, #endif if (!cpi->refresh_alt_ref_frame) { - double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate; + double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); if (two_pass_min_rate < lower_bounds_min_rate) two_pass_min_rate = lower_bounds_min_rate; - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate); + cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.framerate); } } @@ -3368,7 +3547,6 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) res = -1; - cm->clr_type = sd->clrtype; vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); @@ -3385,9 +3563,9 @@ static int frame_is_reference(const VP9_COMP *cpi) { cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame || cm->refresh_frame_context || - mb->mode_ref_lf_delta_update || - mb->update_mb_segmentation_map || - mb->update_mb_segmentation_data; + mb->lf.mode_ref_delta_update || + mb->seg.update_map || + mb->seg.update_data; } #if CONFIG_MULTIPLE_ARF @@ -3458,7 +3636,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->is_src_frame_alt_ref = 0; // TODO(agrange) This needs to vary depending on where the next ARF is. 
- cm->frames_till_alt_ref_frame = frames_to_arf; + cpi->frames_till_alt_ref_frame = frames_to_arf; #if CONFIG_MULTIPLE_ARF if (!cpi->multi_arf_enabled) @@ -3565,18 +3743,18 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, if (this_duration) { if (step) { - vp9_new_frame_rate(cpi, 10000000.0 / this_duration); + vp9_new_framerate(cpi, 10000000.0 / this_duration); } else { // Average this frame's rate into the last second's average // frame rate. If we haven't seen 1 second yet, then average // over the whole interval seen. const double interval = MIN((double)(cpi->source->ts_end - cpi->first_time_stamp_ever), 10000000.0); - double avg_duration = 10000000.0 / cpi->oxcf.frame_rate; + double avg_duration = 10000000.0 / cpi->oxcf.framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; - vp9_new_frame_rate(cpi, 10000000.0 / avg_duration); + vp9_new_framerate(cpi, 10000000.0 / avg_duration); } } @@ -3691,16 +3869,16 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, double sq_error; ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride, - recon->y_buffer, recon->y_stride, orig->y_width, - orig->y_height); + recon->y_buffer, recon->y_stride, + orig->y_crop_width, orig->y_crop_height); ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride, - recon->u_buffer, recon->uv_stride, orig->uv_width, - orig->uv_height); + recon->u_buffer, recon->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride, - recon->v_buffer, recon->uv_stride, orig->uv_width, - orig->uv_height); + recon->v_buffer, recon->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); sq_error = ye + ue + ve; @@ -3716,21 +3894,21 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, double weight = 0; #if CONFIG_POSTPROC vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, - cm->filter_level * 10 / 6); + cpi->mb.e_mbd.lf.filter_level * 10 / 6); #endif vp9_clear_system_state(); ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride, - pp->y_buffer, pp->y_stride, orig->y_width, - orig->y_height); + pp->y_buffer, pp->y_stride, + orig->y_crop_width, orig->y_crop_height); ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride, - pp->u_buffer, pp->uv_stride, orig->uv_width, - orig->uv_height); + pp->u_buffer, pp->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride, - pp->v_buffer, pp->uv_stride, orig->uv_width, - orig->uv_height); + pp->v_buffer, pp->uv_stride, + orig->uv_crop_width, orig->uv_crop_height); sq_error = ye + ue + ve; @@ -3791,7 +3969,7 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, else { int ret; #if CONFIG_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, dest, flags); + ret = vp9_post_proc_frame(&cpi->common, &cpi->mb.e_mbd.lf, dest, flags); #else if (cpi->common.frame_to_show) { @@ -3811,11 +3989,11 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, } int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[MAX_MB_SEGMENTS], - int delta_lf[MAX_MB_SEGMENTS], - unsigned int threshold[MAX_MB_SEGMENTS]) { + unsigned int cols, int delta_q[MAX_SEGMENTS], + int delta_lf[MAX_SEGMENTS], + unsigned int threshold[MAX_SEGMENTS]) { VP9_COMP *cpi = (VP9_COMP *) comp; - signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS]; + signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; MACROBLOCKD 
*xd = &cpi->mb.e_mbd; int i; @@ -3834,23 +4012,23 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, vp9_enable_segmentation((VP9_PTR)cpi); // Set up the quan, LF and breakout threshold segment data - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; cpi->segment_encode_breakout[i] = threshold[i]; } // Enable the loop and quant changes in the feature mask - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { if (delta_q[i]) - vp9_enable_segfeature(xd, i, SEG_LVL_ALT_Q); + vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); else - vp9_disable_segfeature(xd, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); if (delta_lf[i]) - vp9_enable_segfeature(xd, i, SEG_LVL_ALT_LF); + vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF); else - vp9_disable_segfeature(xd, i, SEG_LVL_ALT_LF); + vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF); } // Initialise the feature data structure diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index f5f1c07..0798927 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -89,9 +89,7 @@ typedef struct { int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; - vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; - vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + struct tx_probs tx_probs; vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; } CODING_CONTEXT; @@ -143,55 +141,52 @@ typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { - THR_ZEROMV, - THR_DC, - THR_NEARESTMV, - THR_NEARMV, - - THR_ZEROG, + THR_NEARESTA, THR_NEARESTG, + THR_NEWMV, + THR_COMP_NEARESTLA, + THR_NEARMV, + THR_COMP_NEARESTGA, - THR_ZEROA, - THR_NEARESTA, + THR_DC, - THR_NEARG, + THR_NEWG, + THR_NEWA, THR_NEARA, - THR_V_PRED, - THR_H_PRED, - THR_D45_PRED, - THR_D135_PRED, - THR_D117_PRED, - THR_D153_PRED, - THR_D27_PRED, - THR_D63_PRED, THR_TM, - THR_NEWMV, - THR_NEWG, - THR_NEWA, + THR_COMP_NEARLA, + THR_COMP_NEWLA, + THR_NEARG, + THR_COMP_NEARGA, + THR_COMP_NEWGA, THR_SPLITMV, THR_SPLITG, THR_SPLITA, + THR_COMP_SPLITLA, + THR_COMP_SPLITGA, - THR_B_PRED, - + THR_ZEROMV, + THR_ZEROG, + THR_ZEROA, THR_COMP_ZEROLA, - THR_COMP_NEARESTLA, - THR_COMP_NEARLA, - THR_COMP_ZEROGA, - THR_COMP_NEARESTGA, - THR_COMP_NEARGA, - THR_COMP_NEWLA, - THR_COMP_NEWGA, - - THR_COMP_SPLITLA, - THR_COMP_SPLITGA, + THR_B_PRED, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D27_PRED, + THR_D153_PRED, + THR_D63_PRED, + THR_D117_PRED, + THR_D45_PRED, } THR_MODES; typedef enum { @@ -200,6 +195,37 @@ typedef enum { HEX = 2 } SEARCH_METHODS; +typedef enum { + USE_FULL_RD = 0, + USE_LARGESTINTRA, + USE_LARGESTINTRA_MODELINTER, + USE_LARGESTALL +} TX_SIZE_SEARCH_METHOD; + +typedef enum { + // Values should be powers of 2 so that they can be selected as bits of + // an integer flags field + + // terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. 
+ FLAG_EARLY_TERMINATE = 1, + + // skips comp inter modes if the best so far is an intra mode + FLAG_SKIP_COMP_BESTINTRA = 2, + + // skips comp inter modes if the best single intermode so far does + // not have the same reference as one of the two references being + // tested + FLAG_SKIP_COMP_REFMISMATCH = 4, + + // skips oblique intra modes if the best so far is an inter mode + FLAG_SKIP_INTRA_BESTINTER = 8, + + // skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions + FLAG_SKIP_INTRA_DIRMISMATCH = 16, +} MODE_SEARCH_SKIP_LOGIC; + typedef struct { int RD; SEARCH_METHODS search_method; @@ -210,53 +236,63 @@ typedef struct { int quarter_pixel_search; int thresh_mult[MAX_MODES]; int max_step_search_steps; - int first_step; + int reduce_first_step_size; + int auto_mv_step_size; int optimize_coefficients; int search_best_filter; int static_segmentation; int comp_inter_joint_search_thresh; - int adpative_rd_thresh; + int adaptive_rd_thresh; + int skip_encode_sb; + int skip_encode_frame; + int use_lastframe_partitioning; + TX_SIZE_SEARCH_METHOD tx_size_search_method; + int use_8tap_always; + int use_avoid_tested_higherror; + int skip_lots_of_modes; + int adjust_thresholds_by_speed; + int partition_by_variance; + int use_one_partition_size_always; + int less_rectangular_check; + int use_square_partition_only; + int unused_mode_skip_lvl; + int reference_masking; + BLOCK_SIZE_TYPE always_this_block_size; + int use_partitions_greater_than; + BLOCK_SIZE_TYPE greater_than_block_size; + int use_partitions_less_than; + BLOCK_SIZE_TYPE less_than_block_size; + int adjust_partitioning_from_last_frame; + int last_partitioning_redo_frequency; + int disable_splitmv; + int using_small_partition_info; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + MB_PREDICTION_MODE last_chroma_intra_mode; + int use_rd_breakout; + int use_uv_intra_rd_estimate; } SPEED_FEATURES; -enum BlockSize { - BLOCK_4X4, - BLOCK_4X8, - BLOCK_8X4, - BLOCK_8X8, - BLOCK_8X16, - BLOCK_16X8, - BLOCK_16X16, - BLOCK_32X32, - BLOCK_32X16, - BLOCK_16X32, - BLOCK_64X32, - BLOCK_32X64, - BLOCK_64X64, - BLOCK_MAX_SB_SEGMENTS, -}; - typedef struct VP9_COMP { + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, short, y_quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, y_quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, y_zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, y_round[QINDEX_RANGE][16]); - - DECLARE_ALIGNED(16, short, uv_quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, uv_quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, uv_zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, uv_round[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); #if CONFIG_ALPHA - DECLARE_ALIGNED(16, short, a_quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, 
a_round[QINDEX_RANGE][16]); - - DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]); #endif - DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]); MACROBLOCK mb; VP9_COMMON common; @@ -274,6 +310,7 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG *un_scaled_source; YV12_BUFFER_CONFIG scaled_source; + unsigned int frames_till_alt_ref_frame; int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref int source_alt_ref_active; // an alt ref frame has been encoded and is usable @@ -316,6 +353,9 @@ typedef struct VP9_COMP { unsigned int mode_check_freq[MAX_MODES]; unsigned int mode_test_hit_counts[MAX_MODES]; unsigned int mode_chosen_counts[MAX_MODES]; + int64_t unused_mode_skip_mask; + int ref_frame_mask; + int set_ref_frame_mask; int rd_thresh_mult[MAX_MODES]; int rd_baseline_thresh[BLOCK_SIZE_TYPES][MAX_MODES]; @@ -323,17 +363,21 @@ typedef struct VP9_COMP { int rd_thresh_freq_fact[BLOCK_SIZE_TYPES][MAX_MODES]; int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; + // FIXME(rbultje) int64_t? int rd_prediction_type_threshes[4][NB_PREDICTION_TYPES]; unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; unsigned int single_ref_count[REF_CONTEXTS][2][2]; unsigned int comp_ref_count[REF_CONTEXTS][2]; - // FIXME contextualize - int64_t rd_tx_select_diff[NB_TXFM_MODES]; + // FIXME(rbultje) can this overflow? int rd_tx_select_threshes[4][NB_TXFM_MODES]; + int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1]; + int64_t rd_filter_cache[VP9_SWITCHABLE_FILTERS + 1]; + int RDMULT; int RDDIV; @@ -349,6 +393,7 @@ typedef struct VP9_COMP { double key_frame_rate_correction_factor; double gf_rate_correction_factor; + unsigned int frames_since_golden; int frames_till_gf_update_due; // Count down till next GF int gf_overspend_bits; // Total bits overspent becasue of GF boost (cumulative) @@ -368,7 +413,7 @@ typedef struct VP9_COMP { int av_per_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation that should be used for any frame int inter_frame_target; - double output_frame_rate; + double output_framerate; int64_t last_time_stamp_seen; int64_t last_end_time_stamp_seen; int64_t first_time_stamp_ever; @@ -458,6 +503,9 @@ typedef struct VP9_COMP { SPEED_FEATURES sf; int error_bins[1024]; + unsigned int max_mv_magnitude; + int mv_step_param; + // Data used for real time conferencing mode to help determine if it would be good to update the gf int inter_zz_count; int gf_bad_count; @@ -466,7 +514,7 @@ typedef struct VP9_COMP { unsigned char *segmentation_map; // segment threashold for encode breakout - int segment_encode_breakout[MAX_MB_SEGMENTS]; + int segment_encode_breakout[MAX_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; @@ -475,7 +523,7 @@ typedef struct VP9_COMP { vp9_full_search_fn_t full_search_sad; vp9_refining_search_fn_t refining_search_sad; vp9_diamond_search_fn_t diamond_search_sad; - vp9_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS]; + vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZE_TYPES]; uint64_t time_receive_data; uint64_t time_compress_data; uint64_t 
time_pick_lpf; @@ -570,7 +618,8 @@ typedef struct VP9_COMP { unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS]; - unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS]; + + unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB]; int initial_width; int initial_height; @@ -617,21 +666,4 @@ extern int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, extern void vp9_alloc_compressor_data(VP9_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval" at %s:%d", \ - __FILE__,__LINE__);\ - } while(0) -#else -#define CHECK_MEM_ERROR(lval,expr) do {\ - lval = (expr); \ - if(!lval) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\ - "Failed to allocate "#lval);\ - } while(0) -#endif - #endif // VP9_ENCODER_VP9_ONYX_INT_H_ diff --git a/libvpx/vp9/encoder/vp9_picklpf.c b/libvpx/vp9/encoder/vp9_picklpf.c index a87d058..2b8f2cd 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -127,6 +127,7 @@ void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) { void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; + struct loopfilter *lf = &cpi->mb.e_mbd.lf; int best_err = 0; int filt_err = 0; @@ -135,7 +136,8 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int filter_step; int filt_high = 0; - int filt_mid = cm->filter_level; // Start search at previous frame filter level + // Start search at previous frame filter level + int filt_mid = lf->filter_level; int filt_low = 0; int filt_best; int filt_direction = 0; @@ -146,12 +148,12 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); if (cm->frame_type == KEY_FRAME) - cm->sharpness_level = 0; + lf->sharpness_level = 0; else - cm->sharpness_level = cpi->oxcf.Sharpness; + lf->sharpness_level = cpi->oxcf.Sharpness; // Start the search at the previous frame filter level unless it is now out of range. - filt_mid = cm->filter_level; + filt_mid = lf->filter_level; if (filt_mid < min_filter_level) filt_mid = min_filter_level; @@ -179,7 +181,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { Bias = Bias * cpi->twopass.section_intra_rating / 20; // yx, bias less for large block size - if (cpi->common.txfm_mode != ONLY_4X4) + if (cpi->common.tx_mode != ONLY_4X4) Bias >>= 1; filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step); @@ -232,5 +234,5 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { } } - cm->filter_level = filt_best; + lf->filter_level = filt_best; } diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 53d8be7..525f4da 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -21,105 +21,145 @@ extern int enc_debug; #endif -static INLINE int plane_idx(int plane) { - return plane == 0 ? 0 : - plane == 1 ? 
16 : 20; -} - -static void quantize(int16_t *zbin_boost_orig_ptr, - int16_t *coeff_ptr, int n_coeffs, int skip_block, - int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, - uint8_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, - int16_t *dequant_ptr, int zbin_oq_value, - uint16_t *eob_ptr, - const int *scan, int mul) { +void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, + int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int i, rc, eob; - int zbin; + int zbins[2], nzbins[2], zbin; int x, y, z, sz; - int zero_run = 0; - int16_t *zbin_boost_ptr = zbin_boost_orig_ptr; + int zero_flag = n_coeffs; vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); eob = -1; + // Base ZBIN + zbins[0] = zbin_ptr[0] + zbin_oq_value; + zbins[1] = zbin_ptr[1] + zbin_oq_value; + nzbins[0] = zbins[0] * -1; + nzbins[1] = zbins[1] * -1; + if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc] * mul; + // Pre-scan pass + for (i = n_coeffs - 1; i >= 0; i--) { + rc = scan[i]; + z = coeff_ptr[rc]; + + if (z < zbins[rc != 0] && z > nzbins[rc != 0]) { + zero_flag--; + } else { + break; + } + } - zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value); - zero_run += (zero_run < 15); + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < zero_flag; i++) { + rc = scan[i]; + z = coeff_ptr[rc]; + + zbin = (zbins[rc != 0]); sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) + x = (z ^ sz) - sz; if (x >= zbin) { x += (round_ptr[rc != 0]); - y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) - >> quant_shift_ptr[rc != 0]; // quantize (x) + y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * + quant_shift_ptr[rc != 0]) >> 16; // quantize (x) x = (y ^ sz) - sz; // get the sign back qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value if (y) { eob = i; // last nonzero coeffs - zero_run = 0; } } } } - *eob_ptr = eob + 1; } -void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, - TX_TYPE tx_type) { - MACROBLOCKD *const xd = &mb->e_mbd; - const int mul = n_coeffs == 1024 ? 2 : 1; - const int *scan; - - // These contexts may be available in the caller - switch (n_coeffs) { - case 4 * 4: - scan = get_scan_4x4(tx_type); - break; - case 8 * 8: - scan = get_scan_8x8(tx_type); - break; - case 16 * 16: - scan = get_scan_16x16(tx_type); - break; - default: - scan = vp9_default_scan_32x32; - break; - } +// This function works well for large transform size. 
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + int16_t *zbin_ptr, int16_t *round_ptr, + int16_t *quant_ptr, int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + int16_t *dequant_ptr, int zbin_oq_value, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, rc, eob; + int zbins[2], nzbins[2], zbin; + int x, y, z, sz; + int idx = 0; + int idx_arr[1024]; + + vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + + eob = -1; + + // Base ZBIN + zbins[0] = zbin_ptr[0] + zbin_oq_value; + zbins[1] = zbin_ptr[1] + zbin_oq_value; + nzbins[0] = zbins[0] * -1; + nzbins[1] = zbins[1] * -1; + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + rc = scan[i]; + z = coeff_ptr[rc] * 2; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (z >= zbins[rc != 0] || z <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } - quantize(mb->plane[plane].zrun_zbin_boost, - BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), - n_coeffs, mb->skip_block, - mb->plane[plane].zbin, - mb->plane[plane].round, - mb->plane[plane].quant, - mb->plane[plane].quant_shift, - BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - xd->plane[plane].dequant, - mb->plane[plane].zbin_extra, - &xd->plane[plane].eobs[block], - scan, mul); + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + rc = scan[idx_arr[i]]; + + // Calculate ZBIN + zbin = (zbins[rc != 0]); + + z = coeff_ptr[rc] * 2; + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) * + quant_shift_ptr[rc != 0]) >> 16; // quantize (x) + + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value + + if (y) { + eob = idx_arr[i]; // last nonzero coeffs + } + } + } + } + *eob_ptr = eob + 1; } void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks) { MACROBLOCKD *const xd = &mb->e_mbd; const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx); - const int *pt_scan = get_scan_4x4(tx_type); + const int16_t *scan = get_scan_4x4(tx_type); + const int16_t *iscan = get_iscan_4x4(tx_type); - quantize(mb->plane[pb_idx.plane].zrun_zbin_boost, - BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16), + vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16), 16, mb->skip_block, mb->plane[pb_idx.plane].zbin, mb->plane[pb_idx.plane].round, @@ -130,10 +170,10 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, xd->plane[pb_idx.plane].dequant, mb->plane[pb_idx.plane].zbin_extra, &xd->plane[pb_idx.plane].eobs[pb_idx.block], - pt_scan, 1); + scan, iscan); } -static void invert_quant(int16_t *quant, uint8_t *shift, int d) { +static void invert_quant(int16_t *quant, int16_t *shift, int d) { unsigned t; int l; t = d; @@ -141,7 +181,7 @@ static void invert_quant(int16_t *quant, uint8_t *shift, int d) { t >>= 1; t = 1 + (1 << (16 + l)) / d; *quant = (int16_t)(t - (1 << 16)); - *shift = l; + *shift = 1 << (16 - l); } void vp9_init_quantizer(VP9_COMP *cpi) { @@ -153,9 +193,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { #endif int q; - static const int zbin_boost[16] = { 
0, 0, 0, 8, 8, 8, 10, 12, - 14, 16, 20, 24, 28, 32, 36, 40 }; - for (q = 0; q < QINDEX_RANGE; q++) { int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80; int qrounding_factor = 48; @@ -163,20 +200,19 @@ void vp9_init_quantizer(VP9_COMP *cpi) { qzbin_factor = 64; qrounding_factor = 64; } + // dc values quant_val = vp9_dc_quant(q, cpi->common.y_dc_delta_q); invert_quant(cpi->y_quant[q] + 0, cpi->y_quant_shift[q] + 0, quant_val); cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.y_dequant[q][0] = quant_val; - cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7; quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q); invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val); cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.uv_dequant[q][0] = quant_val; - cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7; #if CONFIG_ALPHA quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q); @@ -184,42 +220,49 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7; cpi->common.a_dequant[q][0] = quant_val; - cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7; #endif quant_val = vp9_ac_quant(q, 0); + invert_quant(cpi->y_quant[q] + 1, cpi->y_quant_shift[q] + 1, quant_val); + cpi->y_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); + cpi->y_round[q][1] = (qrounding_factor * quant_val) >> 7; cpi->common.y_dequant[q][1] = quant_val; + quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q); + invert_quant(cpi->uv_quant[q] + 1, cpi->uv_quant_shift[q] + 1, + quant_uv_val); + cpi->uv_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7); + cpi->uv_round[q][1] = (qrounding_factor * quant_uv_val) >> 7; cpi->common.uv_dequant[q][1] = quant_uv_val; + #if CONFIG_ALPHA quant_alpha_val = vp9_ac_quant(q, cpi->common.a_ac_delta_q); + invert_quant(cpi->a_quant[q] + 1, cpi->a_quant_shift[q] + 1, + quant_alpha_val); + cpi->a_zbin[q][1] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); + cpi->a_round[q][1] = (qrounding_factor * quant_alpha_val) >> 7; cpi->common.a_dequant[q][1] = quant_alpha_val; #endif - // all the 4x4 ac values =; - for (i = 1; i < 16; i++) { - int rc = vp9_default_scan_4x4[i]; - - invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val); - cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7); - cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7; - cpi->zrun_zbin_boost_y[q][i] = - ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7); - - invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc, - quant_uv_val); - cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7); - cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7; - cpi->zrun_zbin_boost_uv[q][i] = - ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7); + + for (i = 2; i < 8; i++) { + cpi->y_quant[q][i] = cpi->y_quant[q][1]; + cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1]; + cpi->y_zbin[q][i] = cpi->y_zbin[q][1]; + cpi->y_round[q][i] = cpi->y_round[q][1]; + cpi->common.y_dequant[q][i] = cpi->common.y_dequant[q][1]; + + cpi->uv_quant[q][i] = cpi->uv_quant[q][1]; + cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1]; + cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1]; + cpi->uv_round[q][i] = 
cpi->uv_round[q][1]; + cpi->common.uv_dequant[q][i] = cpi->common.uv_dequant[q][1]; #if CONFIG_ALPHA - invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc, - quant_alpha_val); - cpi->a_zbin[q][rc] = - ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7); - cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7; - cpi->zrun_zbin_boost_a[q][i] = - ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7); + cpi->a_quant[q][i] = cpi->a_quant[q][1]; + cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1]; + cpi->a_zbin[q][i] = cpi->a_zbin[q][1]; + cpi->a_round[q][i] = cpi->a_round[q][1]; + cpi->common.a_dequant[q][i] = cpi->common.a_dequant[q][1]; #endif } } @@ -240,7 +283,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].quant_shift = cpi->y_quant_shift[qindex]; x->plane[0].zbin = cpi->y_zbin[qindex]; x->plane[0].round = cpi->y_round[qindex]; - x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex]; x->plane[0].zbin_extra = (int16_t)zbin_extra; x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex]; @@ -253,7 +295,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_shift = cpi->uv_quant_shift[qindex]; x->plane[i].zbin = cpi->uv_zbin[qindex]; x->plane[i].round = cpi->uv_round[qindex]; - x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex]; x->plane[i].zbin_extra = (int16_t)zbin_extra; x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex]; } @@ -263,12 +304,11 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[3].quant_shift = cpi->a_quant_shift[qindex]; x->plane[3].zbin = cpi->a_zbin[qindex]; x->plane[3].round = cpi->a_round[qindex]; - x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex]; x->plane[3].zbin_extra = (int16_t)zbin_extra; x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; #endif - x->skip_block = vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); + x->skip_block = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP); /* save this macroblock QIndex for vp9_update_zbin_extra() */ x->e_mbd.q_index = qindex; diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h index 2b1eeab..3229eaa 100644 --- a/libvpx/vp9/encoder/vp9_quantize.h +++ b/libvpx/vp9/encoder/vp9_quantize.h @@ -22,9 +22,6 @@ #define prototype_quantize_mb(sym) \ void (sym)(MACROBLOCK *x) -void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs, - TX_TYPE tx_type); - void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2, int y_blocks); void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 430d3a8..d3a9529 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -17,7 +17,6 @@ #include <math.h> #include "vp9/common/vp9_alloccommon.h" -#include "vp9/common/vp9_modecont.h" #include "vp9/common/vp9_common.h" #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_entropymode.h" @@ -33,46 +32,8 @@ // Bits Per MB at different Q (Multiplied by 512) #define BPER_MB_NORMBITS 9 -// % adjustment to target kf size based on seperation from previous frame -static const int kf_boost_seperation_adjustment[16] = { - 30, 40, 50, 55, 60, 65, 70, 75, - 80, 85, 90, 95, 100, 100, 100, 100, -}; - -static const int gf_adjust_table[101] = { - 100, - 115, 130, 145, 160, 175, 190, 200, 210, 220, 230, - 240, 260, 270, 280, 290, 300, 310, 320, 330, 340, - 350, 360, 370, 380, 390, 400, 400, 400, 400, 400, - 
400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, - 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, -}; - -static const int gf_intra_usage_adjustment[20] = { - 125, 120, 115, 110, 105, 100, 95, 85, 80, 75, - 70, 65, 60, 55, 50, 50, 50, 50, 50, 50, -}; - -static const int gf_interval_table[101] = { - 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, -}; - -static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 }; +static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = + { 1, 2, 3, 4, 5 }; // These functions use formulaic calculations to make playing with the // quantizer tables easier. If necessary they can be replaced by lookup @@ -128,7 +89,7 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob); vp9_copy(cc->partition_prob, cm->fc.partition_prob); - vp9_copy(cc->segment_pred_probs, cm->segment_pred_probs); + vp9_copy(cc->segment_pred_probs, xd->seg.pred_probs); vp9_copy(cc->intra_inter_prob, cm->fc.intra_inter_prob); vp9_copy(cc->comp_inter_prob, cm->fc.comp_inter_prob); @@ -138,14 +99,12 @@ void vp9_save_coding_context(VP9_COMP *cpi) { vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); - vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas); - vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas); + vp9_copy(cc->last_ref_lf_deltas, xd->lf.last_ref_deltas); + vp9_copy(cc->last_mode_lf_deltas, xd->lf.last_mode_deltas); vp9_copy(cc->coef_probs, cm->fc.coef_probs); vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob); - vp9_copy(cc->tx_probs_8x8p, cm->fc.tx_probs_8x8p); - vp9_copy(cc->tx_probs_16x16p, cm->fc.tx_probs_16x16p); - vp9_copy(cc->tx_probs_32x32p, cm->fc.tx_probs_32x32p); + cc->tx_probs = cm->fc.tx_probs; vp9_copy(cc->mbskip_probs, cm->fc.mbskip_probs); } @@ -168,7 +127,7 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob); vp9_copy(cm->fc.partition_prob, cc->partition_prob); - vp9_copy(cm->segment_pred_probs, cc->segment_pred_probs); + vp9_copy(xd->seg.pred_probs, cc->segment_pred_probs); vp9_copy(cm->fc.intra_inter_prob, cc->intra_inter_prob); vp9_copy(cm->fc.comp_inter_prob, cc->comp_inter_prob); @@ -179,14 +138,12 @@ void vp9_restore_coding_context(VP9_COMP *cpi) { cpi->coding_context.last_frame_seg_map_copy, (cm->mi_rows * cm->mi_cols)); - vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas); - vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas); + vp9_copy(xd->lf.last_ref_deltas, cc->last_ref_lf_deltas); + vp9_copy(xd->lf.last_mode_deltas, cc->last_mode_lf_deltas); vp9_copy(cm->fc.coef_probs, cc->coef_probs); vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob); - vp9_copy(cm->fc.tx_probs_8x8p, cc->tx_probs_8x8p); - vp9_copy(cm->fc.tx_probs_16x16p, cc->tx_probs_16x16p); - vp9_copy(cm->fc.tx_probs_32x32p, cc->tx_probs_32x32p); + cm->fc.tx_probs = cc->tx_probs; 
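// The three per-size transform-size probability tables are grouped into a
// single aggregate in this revision, so one struct assignment replaces the
// three vp9_copy() calls that previously restored tx_probs_8x8p,
// tx_probs_16x16p and tx_probs_32x32p individually. A minimal sketch of the
// idea (field names and dimensions are illustrative, not the actual
// definition in the libvpx headers):
//
//   struct tx_probs_sketch {
//     vp9_prob p8x8[2][1];
//     vp9_prob p16x16[2][2];
//     vp9_prob p32x32[2][3];
//   };
//   struct tx_probs_sketch saved, current;
//   saved = current;   // copies every embedded array by value
//
// Because the aggregate holds no pointers, assigning it copies all of the
// probabilities, which is what the removed element-wise copies did.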
vp9_copy(cm->fc.mbskip_probs, cc->mbskip_probs); } @@ -456,7 +413,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) { * whichever is smaller. */ int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1; - av_key_frame_frequency = (int)cpi->output_frame_rate * 2; + av_key_frame_frequency = (int)cpi->output_framerate * 2; if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) av_key_frame_frequency = cpi->oxcf.key_freq; diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 9cb7ab0..843cf3f 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -53,56 +53,49 @@ DECLARE_ALIGNED(16, extern const uint8_t, #define SPLITMV 0x10000 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - {ZEROMV, LAST_FRAME, NONE}, - {DC_PRED, INTRA_FRAME, NONE}, - {NEARESTMV, LAST_FRAME, NONE}, - {NEARMV, LAST_FRAME, NONE}, - - {ZEROMV, GOLDEN_FRAME, NONE}, + {NEARESTMV, ALTREF_FRAME, NONE}, {NEARESTMV, GOLDEN_FRAME, NONE}, + {NEWMV, LAST_FRAME, NONE}, + {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, + {NEARMV, LAST_FRAME, NONE}, + {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - {ZEROMV, ALTREF_FRAME, NONE}, - {NEARESTMV, ALTREF_FRAME, NONE}, + {DC_PRED, INTRA_FRAME, NONE}, - {NEARMV, GOLDEN_FRAME, NONE}, + {NEWMV, GOLDEN_FRAME, NONE}, + {NEWMV, ALTREF_FRAME, NONE}, {NEARMV, ALTREF_FRAME, NONE}, - {V_PRED, INTRA_FRAME, NONE}, - {H_PRED, INTRA_FRAME, NONE}, - {D45_PRED, INTRA_FRAME, NONE}, - {D135_PRED, INTRA_FRAME, NONE}, - {D117_PRED, INTRA_FRAME, NONE}, - {D153_PRED, INTRA_FRAME, NONE}, - {D27_PRED, INTRA_FRAME, NONE}, - {D63_PRED, INTRA_FRAME, NONE}, - {TM_PRED, INTRA_FRAME, NONE}, - {NEWMV, LAST_FRAME, NONE}, - {NEWMV, GOLDEN_FRAME, NONE}, - {NEWMV, ALTREF_FRAME, NONE}, + {NEARMV, LAST_FRAME, ALTREF_FRAME}, + {NEWMV, LAST_FRAME, ALTREF_FRAME}, + {NEARMV, GOLDEN_FRAME, NONE}, + {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, + {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, {SPLITMV, LAST_FRAME, NONE}, {SPLITMV, GOLDEN_FRAME, NONE}, {SPLITMV, ALTREF_FRAME, NONE}, + {SPLITMV, LAST_FRAME, ALTREF_FRAME}, + {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, - {I4X4_PRED, INTRA_FRAME, NONE}, - - /* compound prediction modes */ + {ZEROMV, LAST_FRAME, NONE}, + {ZEROMV, GOLDEN_FRAME, NONE}, + {ZEROMV, ALTREF_FRAME, NONE}, {ZEROMV, LAST_FRAME, ALTREF_FRAME}, - {NEARESTMV, LAST_FRAME, ALTREF_FRAME}, - {NEARMV, LAST_FRAME, ALTREF_FRAME}, - {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME}, - {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - {NEARMV, GOLDEN_FRAME, ALTREF_FRAME}, - - {NEWMV, LAST_FRAME, ALTREF_FRAME}, - {NEWMV, GOLDEN_FRAME, ALTREF_FRAME}, - {SPLITMV, LAST_FRAME, ALTREF_FRAME}, - {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME}, + {I4X4_PRED, INTRA_FRAME, NONE}, + {H_PRED, INTRA_FRAME, NONE}, + {V_PRED, INTRA_FRAME, NONE}, + {D135_PRED, INTRA_FRAME, NONE}, + {D27_PRED, INTRA_FRAME, NONE}, + {D153_PRED, INTRA_FRAME, NONE}, + {D63_PRED, INTRA_FRAME, NONE}, + {D117_PRED, INTRA_FRAME, NONE}, + {D45_PRED, INTRA_FRAME, NONE}, }; // The baseline rd thresholds for breaking out of the rd loop for @@ -116,8 +109,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] = #define MAX_RD_THRESH_FREQ_FACT 32 #define MAX_RD_THRESH_FREQ_INC 1 -static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES], - vp9_coeff_count (*cnoskip)[BLOCK_TYPES], +static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2], vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { int i, j, k, l; TX_SIZE t; @@ -128,26 +120,21 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES], for (l = 0; l < PREV_COEF_CONTEXTS; l++) { vp9_prob 
probs[ENTROPY_NODES]; vp9_model_to_full_probs(p[t][i][j][k][l], probs); - vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs, + vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs, vp9_coef_tree); -#if CONFIG_BALANCED_COEFTREE - // Replace the eob node prob with a very small value so that the - // cost approximately equals the cost without the eob node - probs[1] = 1; - vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree); -#else - vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs, + vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs, vp9_coef_tree); - assert(c[t][i][j][k][l][DCT_EOB_TOKEN] == - cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]); -#endif + assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] == + c[t][i][j][1][k][l][DCT_EOB_TOKEN]); } } -static int rd_iifactor[32] = { 4, 4, 3, 2, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, }; +static const int rd_iifactor[32] = { + 4, 4, 3, 2, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; // 3* dc_qlookup[Q]*dc_qlookup[Q]; @@ -227,7 +214,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->rd_threshes[bsize][i] = INT_MAX; } cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; + + if (cpi->sf.adaptive_rd_thresh) + cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; + else + cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } else { @@ -247,14 +238,16 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->rd_threshes[bsize][i] = INT_MAX; } cpi->rd_baseline_thresh[bsize][i] = cpi->rd_threshes[bsize][i]; - cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; + + if (cpi->sf.adaptive_rd_thresh) + cpi->rd_thresh_freq_fact[bsize][i] = MAX_RD_THRESH_FREQ_FACT; + else + cpi->rd_thresh_freq_fact[bsize][i] = BASE_RD_THRESH_FREQ_FACT; } } } - fill_token_costs(cpi->mb.token_costs, - cpi->mb.token_costs_noskip, - cpi->common.fc.coef_probs); + fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs); for (i = 0; i < NUM_PARTITION_CONTEXTS; i++) vp9_cost_tokens(cpi->mb.partition_cost[i], @@ -271,168 +264,619 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { cpi->mb.nmvcost_hp : cpi->mb.nmvcost, &cpi->common.fc.nmvc, cpi->mb.e_mbd.allow_high_precision_mv, 1, 1); + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) { + MB_PREDICTION_MODE m; + + for (m = NEARESTMV; m < MB_MODE_COUNT; m++) + cpi->mb.inter_mode_cost[i][m - NEARESTMV] = + cost_token(vp9_inter_mode_tree, + cpi->common.fc.inter_mode_probs[i], + vp9_inter_mode_encodings - NEARESTMV + m); + } + } +} + +static INLINE BLOCK_SIZE_TYPE get_block_size(int bwl, int bhl) { + return bsize_from_dim_lookup[bwl][bhl]; +} + +static BLOCK_SIZE_TYPE get_plane_block_size(BLOCK_SIZE_TYPE bsize, + struct macroblockd_plane *pd) { + return get_block_size(plane_block_width_log2by4(bsize, pd), + plane_block_height_log2by4(bsize, pd)); +} + +static INLINE void linear_interpolate2(double x, int ntab, int inv_step, + const double *tab1, const double *tab2, + double *v1, double *v2) { + double y = x * inv_step; + int d = (int) y; + if (d >= ntab - 1) { + *v1 = tab1[ntab - 1]; + *v2 = tab2[ntab - 1]; + } else { + double a = y - d; + *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a; + *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a; } } -int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) { - int i, error = 0; +static void model_rd_norm(double x, double *R, double *D) { + static 
const int inv_tab_step = 8; + static const int tab_size = 120; + // NOTE: The tables below must be of the same size + // + // Normalized rate + // This table models the rate for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const double rate_tab[] = { + 64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194, + 2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206, + 1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708, + 0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412, + 0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236, + 0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132, + 0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073, + 0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040, + 0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022, + 0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012, + 0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006, + 0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003, + 0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002, + 0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, + 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000, + }; + // Normalized distortion + // This table models the normalized distortion for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance) + // Note the actual distortion is Dn * variance. + static const double dist_tab[] = { + 0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061, + 0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242, + 0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458, + 0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645, + 0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780, + 0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870, + 0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925, + 0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957, + 0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976, + 0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987, + 0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993, + 0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996, + 0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998, + 0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000, + }; + /* + assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]); + assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]); + assert(sizeof(rate_tab) == sizeof(dist_tab)); + */ + assert(x >= 0.0); + linear_interpolate2(x, tab_size, inv_tab_step, + rate_tab, dist_tab, R, D); +} + +static void model_rd_from_var_lapndz(int var, int n, int qstep, + int *rate, int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. 
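// For reference, the closed forms quoted above can be evaluated directly;
// a minimal sketch (helper names are hypothetical, assumes <math.h>, valid
// for x > 0 -- the x == 0 entry of rate_tab is a cap rather than the limit):
//
//   static double bin_entropy(double p) {   // H(p), in bits
//     return (p <= 0.0 || p >= 1.0) ? 0.0
//                                   : -p * log2(p) - (1 - p) * log2(1 - p);
//   }
//   static void closed_form_rd_norm(double x, double *R, double *D) {
//     const double r = exp(-sqrt(2.0) * x);  // x = qpstep / sqrt(variance)
//     *R = bin_entropy(sqrt(r)) + sqrt(r) * (1.0 + bin_entropy(r) / (1.0 - r));
//     *D = 1.0 - (x / sqrt(2.0)) / sinh(x / sqrt(2.0));
//   }
//
// For example, x = 1.0 gives R ~= 2.014 and D ~= 0.079, matching rate_tab[8]
// and dist_tab[8] above (the table step is 1 / inv_tab_step = 1/8).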
+ vp9_clear_system_state(); + if (var == 0 || n == 0) { + *rate = 0; + *dist = 0; + } else { + double D, R; + double s2 = (double) var / n; + double x = qstep / sqrt(s2); + model_rd_norm(x, &R, &D); + *rate = ((n << 8) * R + 0.5); + *dist = (var * D + 0.5); + } + vp9_clear_system_state(); +} + +static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. + int i, rate_sum = 0, dist_sum = 0; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + + // TODO(dkovalev) the same code in get_plane_block_size + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); + const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); + unsigned int sse; + int rate; + int64_t dist; + (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + // sse works better than var, since there is no dc prediction used + model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), + pd->dequant[1] >> 3, &rate, &dist); + + rate_sum += rate; + dist_sum += dist; + } + + *out_rate_sum = rate_sum; + *out_dist_sum = dist_sum << 4; +} + +static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
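// Worked example with illustrative numbers: for a 16x16 luma block,
// n = 16 << (bwl + bhl) = 256 samples. If the variance function reports
// sse = 4096 and pd->dequant[1] = 32, the effective step is 32 >> 3 = 4,
// the per-sample variance is s2 = 4096 / 256 = 16, and
// x = 4 / sqrt(16) = 1.0, so the tables give R ~= 2.014 and D ~= 0.079.
// model_rd_from_var_lapndz then returns rate ~= (256 << 8) * 2.014 ~= 132000
// and dist ~= 4096 * 0.079 ~= 324, before the << 4 applied to out_dist_sum
// below.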
+ struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + + // TODO(dkovalev) the same code in get_plane_block_size + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); + const BLOCK_SIZE_TYPE bs = get_block_size(bwl, bhl); + unsigned int sse; + int rate; + int64_t dist; + (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + // sse works better than var, since there is no dc prediction used + model_rd_from_var_lapndz(sse, 16 << (bwl + bhl), + pd->dequant[1] >> 3, &rate, &dist); + + *out_rate_sum = rate; + *out_dist_sum = dist << 4; +} + +static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + TX_SIZE tx_size, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum, + int *out_skip) { + int t = 4, j, k; + BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int width = plane_block_width(bsize, pd); + const int height = plane_block_height(bsize, pd); + int rate_sum = 0; + int64_t dist_sum = 0; + + if (tx_size == TX_4X4) { + bs = BLOCK_4X4; + t = 4; + } else if (tx_size == TX_8X8) { + bs = BLOCK_8X8; + t = 8; + } else if (tx_size == TX_16X16) { + bs = BLOCK_16X16; + t = 16; + } else if (tx_size == TX_32X32) { + bs = BLOCK_32X32; + t = 32; + } else { + assert(0); + } + *out_skip = 1; + for (j = 0; j < height; j += t) { + for (k = 0; k < width; k += t) { + int rate; + int64_t dist; + unsigned int sse; + (void) cpi->fn_ptr[bs].vf(p->src.buf + j * p->src.stride + k, + p->src.stride, + pd->dst.buf + j * pd->dst.stride + k, + pd->dst.stride, &sse); + // sse works better than var, since there is no dc prediction used + model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, + &rate, &dist); + rate_sum += rate; + dist_sum += dist; + *out_skip &= (rate < 1024); + } + } + *out_rate_sum = rate_sum; + *out_dist_sum = (dist_sum << 4); +} + +int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; for (i = 0; i < block_size; i++) { int this_diff = coeff[i] - dqcoeff[i]; - error += this_diff * this_diff; + error += (unsigned)this_diff * this_diff; + sqcoeff += (unsigned) coeff[i] * coeff[i]; } + *ssz = sqcoeff; return error; } +static const int16_t band_counts[TX_SIZE_MAX_SB][8] = { + { 1, 2, 3, 4, 3, 16 - 13 }, + { 1, 2, 3, 4, 11, 64 - 21 }, + { 1, 2, 3, 4, 11, 256 - 21 }, + { 1, 2, 3, 4, 11, 1024 - 21 }, +}; + static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, int plane, int block, PLANE_TYPE type, - ENTROPY_CONTEXT *A, - ENTROPY_CONTEXT *L, + ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, - int y_blocks) { + const int16_t *scan, const int16_t *nb) { MACROBLOCKD *const xd = &mb->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - int pt; - int c = 0; - int cost = 0, pad; - const int *scan, *nb; + int pt, c, cost; + const int16_t *band_count = band_counts[tx_size]; const int eob = xd->plane[plane].eobs[block]; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, - block, 16); + const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); const int ref = mbmi->ref_frame[0] != INTRA_FRAME; - unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - ENTROPY_CONTEXT above_ec, left_ec; - TX_TYPE tx_type = DCT_DCT; - - const int 
segment_id = xd->mode_info_context->mbmi.segment_id; - unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = - mb->token_costs_noskip[tx_size][type][ref]; - - int seg_eob, default_eob; + unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref]; + ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; uint8_t token_cache[1024]; - const uint8_t * band_translate; // Check for consistency of tx_size with mode info assert((!type && !plane) || (type && plane)); if (type == PLANE_TYPE_Y_WITH_DC) { assert(xd->mode_info_context->mbmi.txfm_size == tx_size); } else { - TX_SIZE tx_size_uv = get_uv_tx_size(mbmi); - assert(tx_size == tx_size_uv); + assert(tx_size == get_uv_tx_size(mbmi)); } + pt = combine_entropy_contexts(above_ec, left_ec); + + if (eob == 0) { + // single eob token + cost = token_costs[0][0][pt][DCT_EOB_TOKEN]; + c = 0; + } else { + int v, prev_t, band = 1, band_left = band_count[1]; + + // dc token + v = qcoeff_ptr[0]; + prev_t = vp9_dct_value_tokens_ptr[v].token; + cost = token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; + token_cache[0] = vp9_pt_energy_class[prev_t]; + + // ac tokens + for (c = 1; c < eob; c++) { + const int rc = scan[c]; + int t; + + v = qcoeff_ptr[rc]; + t = vp9_dct_value_tokens_ptr[v].token; + pt = get_coef_context(nb, token_cache, c); + cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v]; + token_cache[rc] = vp9_pt_energy_class[t]; + prev_t = t; + if (!--band_left) { + band_left = band_count[++band]; + } + } + + // eob token + if (band < 6) { + pt = get_coef_context(nb, token_cache, c); + cost += token_costs[0][band][pt][DCT_EOB_TOKEN]; + } + } + + // is eob first coefficient; + *A = *L = c > 0; + + return cost; +} + +struct rdcost_block_args { + VP9_COMMON *cm; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[16]; + ENTROPY_CONTEXT t_left[16]; + TX_SIZE tx_size; + int bw; + int bh; + int rate; + int64_t dist; + int64_t sse; + int64_t best_rd; + int skip; + const int16_t *scan, *nb; +}; + +static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args* args = arg; + MACROBLOCK* const x = args->x; + MACROBLOCKD* const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + int64_t this_sse; + int shift = args->tx_size == TX_32X32 ? 0 : 2; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, + &this_sse) >> shift; + args->sse += this_sse >> shift; + + if (x->skip_encode && + xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { + // TODO(jingning): tune the model to better capture the distortion. 
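// The penalty below grows with the squared quantizer step (pd->dequant[1])
// and with the coefficient count of the transform block (tracked by
// 1 << ss_txfrm_size), and it is added to both dist and sse: when
// x->skip_encode makes intra prediction use the source pixels themselves as
// the reference (see the x->skip_encode ? src : dst selections later in this
// file), the true reconstruction error is not available, so this term stands
// in for it, as the TODO notes.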
+ int64_t p = (pd->dequant[1] * pd->dequant[1] * + (1 << ss_txfrm_size)) >> shift; + args->dist += p; + args->sse += p; + } +} + +static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args* args = arg; + int x_idx, y_idx; + MACROBLOCKD * const xd = &args->x->e_mbd; + + txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, + &y_idx); + + args->rate += cost_coeffs(args->cm, args->x, plane, block, + xd->plane[plane].plane_type, args->t_above + x_idx, + args->t_left + y_idx, args->tx_size, + args->scan, args->nb); +} + +// FIXME(jingning): need to make the rd test of chroma components consistent +// with that of luma component. this function should be deprecated afterwards. +static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + MACROBLOCKD * const xd = &x->e_mbd; + const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]); + const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]); + const int bw = 1 << bwl, bh = 1 << bhl; + int i; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + 0, 0, 0, INT64_MAX, 0 }; + switch (tx_size) { - case TX_4X4: { - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_4x4(xd, block) : DCT_DCT; - above_ec = A[0] != 0; - left_ec = L[0] != 0; - seg_eob = 16; - scan = get_scan_4x4(tx_type); - band_translate = vp9_coefband_trans_4x4; + case TX_4X4: + vpx_memcpy(&args.t_above, xd->plane[plane].above_context, + sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(&args.t_left, xd->plane[plane].left_context, + sizeof(ENTROPY_CONTEXT) * bh); + args.scan = vp9_default_scan_4x4; + args.nb = vp9_default_scan_4x4_neighbors; break; - } - case TX_8X8: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 1 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; - above_ec = (A[0] + A[1]) != 0; - left_ec = (L[0] + L[1]) != 0; - scan = get_scan_8x8(tx_type); - seg_eob = 64; - band_translate = vp9_coefband_trans_8x8plus; + case TX_8X8: + for (i = 0; i < bw; i += 2) + args.t_above[i] = !!*(uint16_t *)&xd->plane[plane].above_context[i]; + for (i = 0; i < bh; i += 2) + args.t_left[i] = !!*(uint16_t *)&xd->plane[plane].left_context[i]; + args.scan = vp9_default_scan_8x8; + args.nb = vp9_default_scan_8x8_neighbors; break; - } - case TX_16X16: { - const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; - const int sz = 2 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; - scan = get_scan_16x16(tx_type); - seg_eob = 256; - above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; - band_translate = vp9_coefband_trans_8x8plus; + case TX_16X16: + for (i = 0; i < bw; i += 4) + args.t_above[i] = !!*(uint32_t *)&xd->plane[plane].above_context[i]; + for (i = 0; i < bh; i += 4) + args.t_left[i] = !!*(uint32_t *)&xd->plane[plane].left_context[i]; + args.scan = vp9_default_scan_16x16; + args.nb = vp9_default_scan_16x16_neighbors; break; - } case TX_32X32: - scan = vp9_default_scan_32x32; - seg_eob = 1024; - above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; - band_translate = vp9_coefband_trans_8x8plus; + for (i = 0; i < bw; i += 8) + args.t_above[i] = !!*(uint64_t *)&xd->plane[plane].above_context[i]; + for (i = 0; i < bh; i += 8) + args.t_left[i] = !!*(uint64_t *)&xd->plane[plane].left_context[i]; + args.scan = vp9_default_scan_32x32; + args.nb = vp9_default_scan_32x32_neighbors; break; default: - abort(); - break; + assert(0); } - assert(eob <= seg_eob); - pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); - default_eob = seg_eob; - - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) - seg_eob = 0; - - /* sanity check to ensure that we do not have spurious non-zero q values */ - if (eob < seg_eob) - assert(qcoeff_ptr[scan[eob]] == 0); - - { - for (c = 0; c < eob; c++) { - int v = qcoeff_ptr[scan[c]]; - int t = vp9_dct_value_tokens_ptr[v].token; - int band = get_coef_band(band_translate, c); - if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob); - - if (!c || token_cache[scan[c - 1]]) // do not skip eob - cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v]; - else - cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v]; - token_cache[scan[c]] = vp9_pt_energy_class[t]; - } - if (c < seg_eob) { - if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob); - cost += mb->token_costs_noskip[tx_size][type][ref] - [get_coef_band(band_translate, c)] - [pt][DCT_EOB_TOKEN]; - } + foreach_transformed_block_in_plane(xd, bsize, plane, rate_block, &args); + return args.rate; +} + +static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + int cost = 0, plane; + + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + cost += rdcost_plane(cm, x, plane, bsize, tx_size); } + return cost; +} - // is eob first coefficient; - for (pt = 0; pt < (1 << tx_size); pt++) { - A[pt] = L[pt] = c > 0; +static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift, int64_t *sse) { + struct macroblockd_plane *p = &x->e_mbd.plane[0]; + const int bwl = plane_block_width_log2by4(bsize, p); + const int bhl = plane_block_height_log2by4(bsize, p); + int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, + 16 << (bwl + bhl), sse) >> shift; + *sse >>= shift; + return e; +} + +static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift, int64_t *sse) { + int64_t sum = 0, this_sse; + int plane; + + *sse = 0; + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + struct macroblockd_plane *p = &x->e_mbd.plane[plane]; + const int bwl = plane_block_width_log2by4(bsize, p); + const int bhl = plane_block_height_log2by4(bsize, p); + sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, + 16 << 
(bwl + bhl), &this_sse); + *sse += this_sse; } + *sse >>= shift; + return sum >> shift; +} - return cost; +static void block_yrd_txfm(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct rdcost_block_args *args = arg; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct encode_b_args encode_args = {args->cm, x, NULL}; + int64_t rd1, rd2, rd; + + if (args->skip) + return; + rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); + rd = MIN(rd1, rd2); + if (rd > args->best_rd) { + args->skip = 1; + args->rate = INT_MAX; + args->dist = INT64_MAX; + args->sse = INT64_MAX; + return; + } + + if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) + encode_block_intra(plane, block, bsize, ss_txfrm_size, &encode_args); + else + xform_quant(plane, block, bsize, ss_txfrm_size, &encode_args); + + dist_block(plane, block, bsize, ss_txfrm_size, args); + rate_block(plane, block, bsize, ss_txfrm_size, args); +} + +static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); + const int bw = 1 << bwl, bh = 1 << bhl; + int i; + struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, + 0, 0, 0, ref_best_rd, 0 }; + xd->mode_info_context->mbmi.txfm_size = tx_size; + switch (tx_size) { + case TX_4X4: + vpx_memcpy(&args.t_above, pd->above_context, + sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(&args.t_left, pd->left_context, + sizeof(ENTROPY_CONTEXT) * bh); + get_scan_nb_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, 0), + &args.scan, &args.nb); + break; + case TX_8X8: + for (i = 0; i < bw; i += 2) + args.t_above[i] = !!*(uint16_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 2) + args.t_left[i] = !!*(uint16_t *)&pd->left_context[i]; + get_scan_nb_8x8(get_tx_type_8x8(PLANE_TYPE_Y_WITH_DC, xd), + &args.scan, &args.nb); + break; + case TX_16X16: + for (i = 0; i < bw; i += 4) + args.t_above[i] = !!*(uint32_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 4) + args.t_left[i] = !!*(uint32_t *)&pd->left_context[i]; + get_scan_nb_16x16(get_tx_type_16x16(PLANE_TYPE_Y_WITH_DC, xd), + &args.scan, &args.nb); + break; + case TX_32X32: + for (i = 0; i < bw; i += 8) + args.t_above[i] = !!*(uint64_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 8) + args.t_left[i] = !!*(uint64_t *)&pd->left_context[i]; + args.scan = vp9_default_scan_32x32; + args.nb = vp9_default_scan_32x32_neighbors; + break; + default: + assert(0); + } + + foreach_transformed_block_in_plane(xd, bsize, 0, block_yrd_txfm, &args); + *distortion = args.dist; + *rate = args.rate; + *sse = args.sse; + *skippable = vp9_sby_is_skippable(xd, bsize) && (!args.skip); +} + +static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE_TYPE bs) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + if (max_txfm_size == TX_32X32 && + (cm->tx_mode == ALLOW_32X32 || + cm->tx_mode == TX_MODE_SELECT)) { 
+ mbmi->txfm_size = TX_32X32; + } else if (max_txfm_size >= TX_16X16 && + (cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + cm->tx_mode == TX_MODE_SELECT)) { + mbmi->txfm_size = TX_16X16; + } else if (cm->tx_mode != ONLY_4X4) { + mbmi->txfm_size = TX_8X8; + } else { + mbmi->txfm_size = TX_4X4; + } + super_block_yrd_for_txfm(cm, x, rate, distortion, skip, + &sse[mbmi->txfm_size], ref_best_rd, bs, + mbmi->txfm_size); + cpi->txfm_stepdown_count[0]++; } static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, - int *d, int *distortion, + int64_t *d, int64_t *distortion, int *s, int *skip, int64_t txfm_cache[NB_TXFM_MODES], - TX_SIZE max_txfm_size) { + BLOCK_SIZE_TYPE bs) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - vp9_prob skip_prob = vp9_get_pred_prob(cm, xd, PRED_MBSKIP); + vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); int64_t rd[TX_SIZE_MAX_SB][2]; int n, m; int s0, s1; - const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); for (n = TX_4X4; n <= max_txfm_size; n++) { r[n][1] = r[n][0]; + if (r[n][0] == INT_MAX) + continue; for (m = 0; m <= n - (n == max_txfm_size); m++) { if (m == n) r[n][1] += vp9_cost_zero(tx_probs[m]); @@ -446,6 +890,10 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s1 = vp9_cost_bit(skip_prob, 1); for (n = TX_4X4; n <= max_txfm_size; n++) { + if (d[n] == INT64_MAX) { + rd[n][0] = rd[n][1] = INT64_MAX; + continue; + } if (s[n]) { rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { @@ -455,29 +903,29 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, } if (max_txfm_size == TX_32X32 && - (cm->txfm_mode == ALLOW_32X32 || - (cm->txfm_mode == TX_MODE_SELECT && + (cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]))) { mbmi->txfm_size = TX_32X32; } else if (max_txfm_size >= TX_16X16 && - (cm->txfm_mode == ALLOW_16X16 || - cm->txfm_mode == ALLOW_32X32 || - (cm->txfm_mode == TX_MODE_SELECT && + (cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]))) { mbmi->txfm_size = TX_16X16; - } else if (cm->txfm_mode == ALLOW_8X8 || - cm->txfm_mode == ALLOW_16X16 || - cm->txfm_mode == ALLOW_32X32 || - (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { + } else if (cm->tx_mode == ALLOW_8X8 || + cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { mbmi->txfm_size = TX_8X8; } else { mbmi->txfm_size = TX_4X4; } *distortion = d[mbmi->txfm_size]; - *rate = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT]; + *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->txfm_size]; txfm_cache[ONLY_4X4] = rd[TX_4X4][0]; @@ -494,119 +942,134 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, else txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ? 
rd[TX_4X4][1] : rd[TX_8X8][1]; -} -static int block_error(int16_t *coeff, int16_t *dqcoeff, - int block_size, int shift) { - int i; - int64_t error = 0; - - for (i = 0; i < block_size; i++) { - int this_diff = coeff[i] - dqcoeff[i]; - error += (unsigned)this_diff * this_diff; - } - error >>= shift; - - return error > INT_MAX ? INT_MAX : (int)error; -} - -static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, - 16 << (bwl + bhl), shift); -} - -static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - int64_t sum = 0; - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - const int subsampling = x->e_mbd.plane[plane].subsampling_x + - x->e_mbd.plane[plane].subsampling_y; - sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - 16 << (bwl + bhl - subsampling), 0); + if (max_txfm_size == TX_32X32 && + rd[TX_32X32][1] < rd[TX_16X16][1] && + rd[TX_32X32][1] < rd[TX_8X8][1] && + rd[TX_32X32][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[0]++; + } else if (max_txfm_size >= TX_16X16 && + rd[TX_16X16][1] < rd[TX_8X8][1] && + rd[TX_16X16][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + } else { + cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; } - sum >>= shift; - return sum > INT_MAX ? INT_MAX : (int)sum; -} - -struct rdcost_block_args { - VP9_COMMON *cm; - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int cost; -}; - -static void rdcost_block(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, void *arg) { - struct rdcost_block_args* args = arg; - int x_idx, y_idx; - MACROBLOCKD * const xd = &args->x->e_mbd; - - txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, - &y_idx); - - args->cost += cost_coeffs(args->cm, args->x, plane, block, - xd->plane[plane].plane_type, args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->bw * args->bh); } -static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD * const xd = &x->e_mbd; - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; - const int bw = 1 << bwl, bh = 1 << bhl; - struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0 }; +static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, + int (*r)[2], int *rate, + int64_t *d, int64_t *distortion, + int *s, int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE_TYPE bs, + int *model_used) { + const TX_SIZE max_txfm_size = TX_32X32 + - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); + int64_t rd[TX_SIZE_MAX_SB][2]; + int n, m; + int s0, s1; + double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00}; + // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00}; - vpx_memcpy(&args.t_above, xd->plane[plane].above_context, - sizeof(ENTROPY_CONTEXT) * bw); - 
vpx_memcpy(&args.t_left, xd->plane[plane].left_context, - sizeof(ENTROPY_CONTEXT) * bh); + const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); - foreach_transformed_block_in_plane(xd, bsize, plane, rdcost_block, &args); + // for (n = TX_4X4; n <= max_txfm_size; n++) + // r[n][0] = (r[n][0] * scale_r[n]); - return args.cost; -} + for (n = TX_4X4; n <= max_txfm_size; n++) { + r[n][1] = r[n][0]; + for (m = 0; m <= n - (n == max_txfm_size); m++) { + if (m == n) + r[n][1] += vp9_cost_zero(tx_probs[m]); + else + r[n][1] += vp9_cost_one(tx_probs[m]); + } + } -static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - int cost = 0, plane; + assert(skip_prob > 0); + s0 = vp9_cost_bit(skip_prob, 0); + s1 = vp9_cost_bit(skip_prob, 1); - for (plane = 1; plane < MAX_MB_PLANE; plane++) { - cost += rdcost_plane(cm, x, plane, bsize, tx_size); + for (n = TX_4X4; n <= max_txfm_size; n++) { + if (s[n]) { + rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); + } else { + rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); + rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); + } + } + for (n = TX_4X4; n <= max_txfm_size; n++) { + rd[n][0] = (scale_rd[n] * rd[n][0]); + rd[n][1] = (scale_rd[n] * rd[n][1]); } - return cost; -} -static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { - MACROBLOCKD *const xd = &x->e_mbd; - xd->mode_info_context->mbmi.txfm_size = tx_size; + if (max_txfm_size == TX_32X32 && + (cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && + rd[TX_32X32][1] <= rd[TX_16X16][1] && + rd[TX_32X32][1] <= rd[TX_8X8][1] && + rd[TX_32X32][1] <= rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_32X32; + } else if (max_txfm_size >= TX_16X16 && + (cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && + rd[TX_16X16][1] <= rd[TX_8X8][1] && + rd[TX_16X16][1] <= rd[TX_4X4][1]))) { + mbmi->txfm_size = TX_16X16; + } else if (cm->tx_mode == ALLOW_8X8 || + cm->tx_mode == ALLOW_16X16 || + cm->tx_mode == ALLOW_32X32 || + (cm->tx_mode == TX_MODE_SELECT && + rd[TX_8X8][1] <= rd[TX_4X4][1])) { + mbmi->txfm_size = TX_8X8; + } else { + mbmi->txfm_size = TX_4X4; + } - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) - vp9_encode_intra_block_y(cm, x, bsize); - else - vp9_xform_quant_sby(cm, x, bsize); + if (model_used[mbmi->txfm_size]) { + // Actually encode using the chosen mode if a model was used, but do not + // update the r, d costs + super_block_yrd_for_txfm(cm, x, rate, distortion, skip, + &sse[mbmi->txfm_size], ref_best_rd, + bs, mbmi->txfm_size); + } else { + *distortion = d[mbmi->txfm_size]; + *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; + *skip = s[mbmi->txfm_size]; + } - *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 
0 : 2); - *rate = rdcost_plane(cm, x, 0, bsize, tx_size); - *skippable = vp9_sby_is_skippable(xd, bsize); + if (max_txfm_size == TX_32X32 && + rd[TX_32X32][1] <= rd[TX_16X16][1] && + rd[TX_32X32][1] <= rd[TX_8X8][1] && + rd[TX_32X32][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[0]++; + } else if (max_txfm_size >= TX_16X16 && + rd[TX_16X16][1] <= rd[TX_8X8][1] && + rd[TX_16X16][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { + cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + } else { + cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + } } static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int *distortion, - int *skip, BLOCK_SIZE_TYPE bs, - int64_t txfm_cache[NB_TXFM_MODES]) { + MACROBLOCK *x, int *rate, int64_t *distortion, + int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs, + int64_t txfm_cache[NB_TXFM_MODES], + int64_t ref_best_rd) { VP9_COMMON *const cm = &cpi->common; - int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; + int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB]; + int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -614,36 +1077,95 @@ static void super_block_yrd(VP9_COMP *cpi, if (mbmi->ref_frame[0] > INTRA_FRAME) vp9_subtract_sby(x, bs); - if (cpi->speed > 4) { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL || + (cpi->sf.tx_size_search_method != USE_FULL_RD && + mbmi->ref_frame[0] == INTRA_FRAME)) { + vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); + choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, + ref_best_rd, bs); + if (psse) + *psse = sse[mbmi->txfm_size]; + return; + } + + if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && + mbmi->ref_frame[0] > INTRA_FRAME) { + int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1}; if (bs >= BLOCK_SIZE_SB32X32) { - mbmi->txfm_size = TX_32X32; - } else if (bs >= BLOCK_SIZE_MB16X16) { - mbmi->txfm_size = TX_16X16; - } else if (bs >= BLOCK_SIZE_SB8X8) { - mbmi->txfm_size = TX_8X8; + if (model_used[TX_32X32]) { + model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, + &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], INT64_MAX, + bs, TX_32X32); + } + } + if (bs >= BLOCK_SIZE_MB16X16) { + if (model_used[TX_16X16]) { + model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, + &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], INT64_MAX, + bs, TX_16X16); + } + } + if (model_used[TX_8X8]) { + model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, + &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); } else { - mbmi->txfm_size = TX_4X4; + super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], INT64_MAX, bs, TX_8X8); } - vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); - super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs, - mbmi->txfm_size); - return; + if (model_used[TX_4X4]) { + model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, + &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); + } else { + super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], INT64_MAX, bs, TX_4X4); + } + choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, + skip, sse, ref_best_rd, bs, model_used); + } else { + if (bs >= BLOCK_SIZE_SB32X32) + super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], 
&d[TX_32X32], + &s[TX_32X32], &sse[TX_32X32], ref_best_rd, + bs, TX_32X32); + if (bs >= BLOCK_SIZE_MB16X16) + super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], + &s[TX_16X16], &sse[TX_16X16], ref_best_rd, + bs, TX_16X16); + super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], + &sse[TX_8X8], ref_best_rd, bs, TX_8X8); + super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], + &sse[TX_4X4], ref_best_rd, bs, TX_4X4); + choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, + skip, txfm_cache, bs); } - if (bs >= BLOCK_SIZE_SB32X32) - super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], - bs, TX_32X32); - if (bs >= BLOCK_SIZE_MB16X16) - super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], - bs, TX_16X16); - super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs, - TX_8X8); - super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs, - TX_4X4); - - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, - TX_32X32 - (bs < BLOCK_SIZE_SB32X32) - - (bs < BLOCK_SIZE_MB16X16)); + if (psse) + *psse = sse[mbmi->txfm_size]; +} + +static int conditional_skipintra(MB_PREDICTION_MODE mode, + MB_PREDICTION_MODE best_intra_mode) { + if (mode == D117_PRED && + best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D63_PRED && + best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D27_PRED && + best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D153_PRED && + best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; } static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, @@ -651,15 +1173,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, - int *bestdistortion, + int64_t *bestdistortion, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = INT64_MAX; int rate = 0; - int distortion; + int64_t distortion; VP9_COMMON *const cm = &cpi->common; - const int src_stride = x->plane[0].src.stride; + struct macroblock_plane *p = &x->plane[0]; + struct macroblockd_plane *pd = &xd->plane[0]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; uint8_t *src, *dst; int16_t *src_diff, *coeff; @@ -667,8 +1192,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT tl[2], templ[2]; TX_TYPE tx_type = DCT_DCT; TX_TYPE best_tx_type = DCT_DCT; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy, block; DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]); @@ -681,6 +1206,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; + // Only do the oblique modes if the best so far is + // one of the neighboring directional modes + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mode, *best_mode)) + continue; + } rate = bmode_costs[mode]; distortion = 0; @@ -688,25 +1219,30 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(tempa, ta, sizeof(ta)); vpx_memcpy(templ, tl, 
sizeof(tl)); - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { + int64_t ssz; + const int16_t *scan; + block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode.first = mode; + xd->mode_info_context->bmi[block].as_mode = mode; src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - x->plane[0].src.buf, src_stride); + p->src.buf, src_stride); src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block, - x->plane[0].src_diff); + p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride); - vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, mode, - dst, xd->plane[0].dst.stride); + pd->dst.buf, dst_stride); + vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), + TX_4X4, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride); vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, - dst, xd->plane[0].dst.stride); + dst, dst_stride); - tx_type = get_tx_type_4x4(xd, block); + tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block); if (tx_type != DCT_DCT) { vp9_short_fht4x4(src_diff, coeff, 8, tx_type); x->quantize_b_4x4(x, block, tx_type, 16); @@ -715,17 +1251,20 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->quantize_b_4x4(x, block, tx_type, 16); } + scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC, - tempa + idx, templ + idy, TX_4X4, 16); - distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, - block, 16), 16) >> 2; - - if (best_tx_type != DCT_DCT) - vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), - dst, xd->plane[0].dst.stride, best_tx_type); + tempa + idx, templ + idy, TX_4X4, scan, + vp9_get_coef_neighbors_handle(scan)); + distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, + block, 16), + 16, &ssz) >> 2; + + if (tx_type != DCT_DCT) + vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + dst, pd->dst.stride, tx_type); else - xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), - dst, xd->plane[0].dst.stride); + xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), + dst, pd->dst.stride); } } @@ -741,34 +1280,41 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, best_tx_type = tx_type; vpx_memcpy(a, tempa, sizeof(tempa)); vpx_memcpy(l, templ, sizeof(templ)); - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { block = ib + idy * 2 + idx; vpx_memcpy(best_dqcoeff[idy * 2 + idx], - BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16), + BLOCK_OFFSET(pd->dqcoeff, block, 16), sizeof(best_dqcoeff[0])); } } } } - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + if (x->skip_encode) + return best_rd; + + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode.first = *best_mode; + xd->mode_info_context->bmi[block].as_mode = *best_mode; + src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, + p->src.buf, src_stride); dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - 
xd->plane[0].dst.buf, - xd->plane[0].dst.stride); + pd->dst.buf, dst_stride); - vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode, - dst, xd->plane[0].dst.stride); + vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4, + *best_mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride); // inverse transform if (best_tx_type != DCT_DCT) vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst, - xd->plane[0].dst.stride, best_tx_type); + dst_stride, best_tx_type); else xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst, - xd->plane[0].dst.stride); + dst_stride); } } @@ -777,15 +1323,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, - int *Distortion, int64_t best_rd) { + int64_t *Distortion, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; int cost = 0; - int distortion = 0; + int64_t distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; @@ -797,15 +1343,15 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, bmode_costs = mb->mbmode_cost; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int mis = xd->mode_info_stride; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int UNINITIALIZED_IS_SAFE(d); + int64_t UNINITIALIZED_IS_SAFE(d); i = idy * 2 + idx; - if (xd->frame_type == KEY_FRAME) { + if (cpi->common.frame_type == KEY_FRAME) { const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
left_block_mode(mic, i) : DC_PRED; @@ -820,51 +1366,45 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, distortion += d; tot_rate_y += ry; - mic->bmi[i].as_mode.first = best_mode; - for (j = 1; j < bh; ++j) - mic->bmi[i + j * 2].as_mode.first = best_mode; - for (j = 1; j < bw; ++j) - mic->bmi[i + j].as_mode.first = best_mode; + mic->bmi[i].as_mode = best_mode; + for (j = 1; j < num_4x4_blocks_high; ++j) + mic->bmi[i + j * 2].as_mode = best_mode; + for (j = 1; j < num_4x4_blocks_wide; ++j) + mic->bmi[i + j].as_mode = best_mode; if (total_rd >= best_rd) - break; + return INT64_MAX; } } - if (total_rd >= best_rd) - return INT64_MAX; - *Rate = cost; *rate_y = tot_rate_y; *Distortion = distortion; - xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode.first; + xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode; return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); } static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, - int64_t txfm_cache[NB_TXFM_MODES]) { + int64_t txfm_cache[NB_TXFM_MODES], + int64_t best_rd) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); MACROBLOCKD *const xd = &x->e_mbd; - int this_rate, this_rate_tokenonly; - int this_distortion, s; - int64_t best_rd = INT64_MAX, this_rd; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion, this_rd; TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); int i; int *bmode_costs = x->mbmode_cost; - if (bsize < BLOCK_SIZE_SB8X8) { - x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4; - return best_rd; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) + txfm_cache[i] = INT64_MAX; } - for (i = 0; i < NB_TXFM_MODES; i++) - txfm_cache[i] = INT64_MAX; - /* Y Search for 32x32 intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_txfm_cache[NB_TXFM_MODES]; @@ -880,8 +1420,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } x->e_mbd.mode_info_context->mbmi.mode = mode; - super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, - bsize, local_txfm_cache); + super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, + bsize, local_txfm_cache, best_rd); + + if (this_rate_tokenonly == INT_MAX) + continue; this_rate = this_rate_tokenonly + bmode_costs[mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); @@ -896,11 +1439,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = s; } - for (i = 0; i < NB_TXFM_MODES; i++) { - int64_t adj_rd = this_rd + local_txfm_cache[i] - - local_txfm_cache[cpi->common.txfm_mode]; - if (adj_rd < txfm_cache[i]) { - txfm_cache[i] = adj_rd; + if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) { + for (i = 0; i < NB_TXFM_MODES; i++) { + int64_t adj_rd = this_rd + local_txfm_cache[i] - + local_txfm_cache[cpi->common.tx_mode]; + if (adj_rd < txfm_cache[i]) { + txfm_cache[i] = adj_rd; + } } } } @@ -912,60 +1457,56 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, - int *skippable, BLOCK_SIZE_TYPE bsize, + int *rate, int64_t *distortion, + int *skippable, int64_t *sse, + BLOCK_SIZE_TYPE bsize, TX_SIZE uv_tx_size) { MACROBLOCKD *const xd = &x->e_mbd; + int64_t dummy; if 
(xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) vp9_encode_intra_block_uv(cm, x, bsize); else vp9_xform_quant_sbuv(cm, x, bsize); - *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2); + *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2, + sse ? sse : &dummy); *rate = rdcost_uv(cm, x, bsize, uv_tx_size); *skippable = vp9_sbuv_is_skippable(xd, bsize); } static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, - BLOCK_SIZE_TYPE bsize) { + int *rate, int64_t *distortion, int *skippable, + int64_t *sse, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); if (mbmi->ref_frame[0] > INTRA_FRAME) vp9_subtract_sbuv(x, bsize); - if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_32X32); - } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_16X16); - } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_8X8); - } else { - super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize, - TX_4X4); - } + super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize, + uv_txfm_size); } static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); int64_t best_rd = INT64_MAX, this_rd; - int this_rate_tokenonly, this_rate; - int this_distortion, s; + int this_rate_tokenonly, this_rate, s; + int64_t this_distortion; - for (mode = DC_PRED; mode <= TM_PRED; mode++) { + MB_PREDICTION_MODE last_mode = bsize <= BLOCK_SIZE_SB8X8 ? + TM_PRED : cpi->sf.last_chroma_intra_mode; + + for (mode = DC_PRED; mode <= last_mode; mode++) { x->e_mbd.mode_info_context->mbmi.uv_mode = mode; super_block_uvrd(&cpi->common, x, &this_rate_tokenonly, - &this_distortion, &s, bsize); + &this_distortion, &s, NULL, bsize); this_rate = this_rate_tokenonly + - x->intra_uv_mode_cost[x->e_mbd.frame_type][mode]; + x->intra_uv_mode_cost[cpi->common.frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); if (this_rd < best_rd) { @@ -983,21 +1524,60 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -int vp9_cost_mv_ref(VP9_COMP *cpi, - MB_PREDICTION_MODE m, - const int mode_context) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - int segment_id = xd->mode_info_context->mbmi.segment_id; - - // Dont account for mode here if segment skip is enabled. 
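The hunk around this point replaces vp9_cost_mv_ref() with a static cost_mv_ref() that looks the mode cost up in a precomputed per-context table (x->inter_mode_cost) instead of walking the mv-ref tree, while still charging zero bits when the segment-level SKIP feature is active. The following standalone sketch shows only that decision; the mode enum, context count and table values are made up for illustration and are not the encoder's real tables.

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the encoder's inter modes and contexts. */
    enum { NEARESTMV = 0, NEARMV, ZEROMV, NEWMV, INTER_MODES };
    #define MODE_CONTEXTS 6

    /* Per-context bit costs for each inter mode (values illustrative only). */
    static const int inter_mode_cost[MODE_CONTEXTS][INTER_MODES] = {
      { 128, 256, 64, 512 }, { 96, 300, 80, 480 }, { 150, 220, 70, 500 },
      { 128, 256, 64, 512 }, { 96, 300, 80, 480 }, { 150, 220, 70, 500 },
    };

    /* If the segment forces SKIP the mode is implied and costs no bits;
     * otherwise the cost is a table lookup by (context, mode). */
    static int cost_mv_ref(int seg_skip_active, int mode, int mode_context) {
      if (seg_skip_active)
        return 0;
      assert(mode >= NEARESTMV && mode <= NEWMV);
      return inter_mode_cost[mode_context][mode - NEARESTMV];
    }

    int main(void) {
      printf("NEWMV cost, ctx 2: %d\n", cost_mv_ref(0, NEWMV, 2));
      printf("NEWMV cost, skip segment: %d\n", cost_mv_ref(1, NEWMV, 2));
      return 0;
    }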
- if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) { - VP9_COMMON *pc = &cpi->common; - assert(NEARESTMV <= m && m <= NEWMV); - return cost_token(vp9_sb_mv_ref_tree, - pc->fc.inter_mode_probs[mode_context], - vp9_sb_mv_ref_encoding_array - NEARESTMV + m); - } else +static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE_TYPE bsize) { + int64_t this_rd; + + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + super_block_uvrd(&cpi->common, x, rate_tokenonly, + distortion, skippable, NULL, bsize); + *rate = *rate_tokenonly + + x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; + this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); + + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + + return this_rd; +} + +static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, + int *rate_uv, int *rate_uv_tokenonly, + int64_t *dist_uv, int *skip_uv, + MB_PREDICTION_MODE *mode_uv) { + MACROBLOCK *const x = &cpi->mb; + + // Use an estimated rd for uv_intra based on DC_PRED if the + // appropriate speed flag is set. + if (cpi->sf.use_uv_intra_rd_estimate) { + rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : + bsize); + // Else do a proper rd search for each possible transform size that may + // be considered in the main rd loop. + } else { + rd_pick_intra_sbuv_mode(cpi, x, + rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 + : bsize); + } + *mode_uv = x->e_mbd.mode_info_context->mbmi.uv_mode; +} + +static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, + int mode_context) { + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int segment_id = xd->mode_info_context->mbmi.segment_id; + + // Don't account for mode here if segment skip is enabled. + if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) { + assert(is_inter_mode(mode)); + return x->inter_mode_cost[mode_context][mode - NEARESTMV]; + } else { return 0; + } } void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) { @@ -1029,8 +1609,8 @@ static int labels2mode(MACROBLOCK *x, int i, MB_MODE_INFO * mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; - int bw = 1 << b_width_log2(mbmi->sb_type); - int bh = 1 << b_height_log2(mbmi->sb_type); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; /* We have to be careful retrieving previously-encoded motion vectors. 
Ones from this macroblock have to be pulled from the BLOCKD array @@ -1072,77 +1652,63 @@ static int labels2mode(MACROBLOCK *x, int i, break; } - cost = vp9_cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + cost = cost_mv_ref(cpi, this_mode, + mbmi->mb_mode_context[mbmi->ref_frame[0]]); mic->bmi[i].as_mv[0].as_int = this_mv->as_int; if (mbmi->ref_frame[1] > 0) mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; x->partition_info->bmi[i].mode = m; - x->partition_info->bmi[i].mv.as_int = this_mv->as_int; - if (mbmi->ref_frame[1] > 0) - x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int; - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); - vpx_memcpy(&x->partition_info->bmi[i + idy * 2 + idx], - &x->partition_info->bmi[i], - sizeof(x->partition_info->bmi[i])); - } - } cost += thismvcost; return cost; } -static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, +static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, + int64_t best_yrd, int i, int *labelyrate, - int *distortion, + int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl) { int k; + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; + const int width = plane_block_width(bsize, &xd->plane[0]); + const int height = plane_block_height(bsize, &xd->plane[0]); int idx, idy; const int src_stride = x->plane[0].src.stride; - uint8_t* const src = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - x->plane[0].src.buf, src_stride); - int16_t* src_diff = - raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i, - x->plane[0].src_diff); + uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + x->plane[0].src.buf, + src_stride); + int16_t* src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, i, + x->plane[0].src_diff); int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i); - uint8_t* const pre = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].pre[0].buf, - xd->plane[0].pre[0].stride); - uint8_t* const dst = - raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride); - int thisdistortion = 0; + uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride); + int64_t thisdistortion = 0, thissse = 0; int thisrate = 0; - *labelyrate = 0; - *distortion = 0; - vp9_build_inter_predictor(pre, xd->plane[0].pre[0].stride, dst, xd->plane[0].dst.stride, &xd->mode_info_context->bmi[i].as_mv[0], &xd->scale_factor[0], - 4 * bw, 4 * bh, 0 /* no avg */, &xd->subpix); + width, height, 0, &xd->subpix, + MV_PRECISION_Q3); - // TODO(debargha): Make this work properly with the - // implicit-compoundinter-weight experiment when implicit - // weighting for splitmv modes is turned on. 
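The code continuing below builds a second inter predictor when the block has two reference frames and passes the avg flag to vp9_build_inter_predictor, so the compound prediction is the rounded per-pixel average of the two single-reference predictions. A small self-contained sketch of that averaging step, independent of the library's predictor builders:

    #include <stdint.h>
    #include <stdio.h>

    /* Rounded average of two 8-bit predictions, as used for compound
     * (two-reference) inter prediction. Strides allow sub-views of larger
     * buffers. */
    static void avg_pred(uint8_t *dst, int dst_stride,
                         const uint8_t *p0, int p0_stride,
                         const uint8_t *p1, int p1_stride,
                         int width, int height) {
      int r, c;
      for (r = 0; r < height; ++r)
        for (c = 0; c < width; ++c)
          dst[r * dst_stride + c] =
              (uint8_t)((p0[r * p0_stride + c] + p1[r * p1_stride + c] + 1) >> 1);
    }

    int main(void) {
      uint8_t a[4] = { 10, 20, 30, 40 }, b[4] = { 11, 21, 29, 40 }, out[4];
      avg_pred(out, 4, a, 4, b, 4, 4, 1);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 11 21 30 40 */
      return 0;
    }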
if (xd->mode_info_context->mbmi.ref_frame[1] > 0) { uint8_t* const second_pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, @@ -1151,17 +1717,20 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride, dst, xd->plane[0].dst.stride, &xd->mode_info_context->bmi[i].as_mv[1], - &xd->scale_factor[1], 4 * bw, 4 * bh, 1, - &xd->subpix); + &xd->scale_factor[1], + width, height, 1, + &xd->subpix, MV_PRECISION_Q3); } - vp9_subtract_block(4 * bh, 4 * bw, src_diff, 8, + vp9_subtract_block(height, width, src_diff, 8, src, src_stride, dst, xd->plane[0].dst.stride); k = i; - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < height / 4; ++idy) { + for (idx = 0; idx < width / 4; ++idx) { + int64_t ssz, rd, rd1, rd2; + k += (idy * 2 + idx); src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k, x->plane[0].src_diff); @@ -1170,30 +1739,50 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, x->quantize_b_4x4(x, k, DCT_DCT, 16); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, - k, 16), 16); + k, 16), 16, &ssz); + thissse += ssz; thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC, ta + (k & 1), - tl + (k >> 1), TX_4X4, 16); + tl + (k >> 1), TX_4X4, + vp9_default_scan_4x4, + vp9_default_scan_4x4_neighbors); + rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2); + rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2); + rd = MIN(rd1, rd2); + if (rd >= best_yrd) + return INT64_MAX; } } - *distortion += thisdistortion; - *labelyrate += thisrate; + *distortion = thisdistortion >> 2; + *labelyrate = thisrate; + *sse = thissse >> 2; - *distortion >>= 2; return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } typedef struct { + int eobs; + int brate; + int byrate; + int64_t bdist; + int64_t bsse; + int64_t brdcost; + int_mv mvs[2]; + ENTROPY_CONTEXT ta[2]; + ENTROPY_CONTEXT tl[2]; +} SEG_RDSTAT; + +typedef struct { int_mv *ref_mv, *second_ref_mv; int_mv mvp; int64_t segment_rd; int r; - int d; + int64_t d; + int64_t sse; int segment_yrate; MB_PREDICTION_MODE modes[4]; - int_mv mvs[4], second_mvs[4]; - int eobs[4]; + SEG_RDSTAT rdstat[4][VP9_INTER_MODES]; int mvthresh; } BEST_SEG_INFO; @@ -1206,50 +1795,6 @@ static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { return r; } -static enum BlockSize get_block_size(int bw, int bh) { - if (bw == 4 && bh == 4) - return BLOCK_4X4; - - if (bw == 4 && bh == 8) - return BLOCK_4X8; - - if (bw == 8 && bh == 4) - return BLOCK_8X4; - - if (bw == 8 && bh == 8) - return BLOCK_8X8; - - if (bw == 8 && bh == 16) - return BLOCK_8X16; - - if (bw == 16 && bh == 8) - return BLOCK_16X8; - - if (bw == 16 && bh == 16) - return BLOCK_16X16; - - if (bw == 32 && bh == 32) - return BLOCK_32X32; - - if (bw == 32 && bh == 16) - return BLOCK_32X16; - - if (bw == 16 && bh == 32) - return BLOCK_16X32; - - if (bw == 64 && bh == 32) - return BLOCK_64X32; - - if (bw == 32 && bh == 64) - return BLOCK_32X64; - - if (bw == 64 && bh == 64) - return BLOCK_64X64; - - assert(0); - return -1; -} - static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { MB_MODE_INFO *mbmi = &x->e_mbd.mode_info_context->mbmi; x->plane[0].src.buf = @@ -1278,32 +1823,31 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, } static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, - BEST_SEG_INFO *bsi, + BEST_SEG_INFO *bsi_buf, int filter_idx, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { 
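rd_check_segment_txsize now receives the whole bsi_buf plus a filter_idx rather than a single BEST_SEG_INFO, so the per-block, per-mode statistics gathered for the first interpolation filter can be reused for later filters when the chosen motion vector is full-pel (and therefore filter-independent) and identical, as the subpelmv/have_ref test further down does. A rough sketch of that reuse pattern with simplified types; the struct, field names and sub-pel mask here are illustrative, not the encoder's definitions.

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_FILTERS 3
    #define INTER_MODES 4

    typedef struct {
      int32_t mv_row, mv_col;   /* motion vector in 1/8-pel units */
      int64_t rd_cost;
      int     valid;
    } ModeStat;

    /* One table of per-mode stats per interpolation filter. */
    static ModeStat stats[NUM_FILTERS][INTER_MODES];

    /* Reuse a cached result only if an earlier filter already evaluated this
     * mode with the same full-pel MV; sub-pel MVs depend on the filter taps,
     * so they are never reused. */
    static int reuse_from_earlier_filter(int filter, int mode,
                                         int32_t mv_row, int32_t mv_col,
                                         ModeStat *out) {
      int f;
      const int subpel = (mv_row & 7) || (mv_col & 7);
      if (filter == 0 || subpel)
        return 0;
      for (f = 0; f < filter; ++f) {
        const ModeStat *s = &stats[f][mode];
        if (s->valid && s->mv_row == mv_row && s->mv_col == mv_col) {
          *out = *s;
          return 1;
        }
      }
      return 0;
    }

    int main(void) {
      ModeStat hit;
      stats[0][1] = (ModeStat){ 16, -8, 12345, 1 };  /* full-pel result, filter 0 */
      if (reuse_from_earlier_filter(1, 1, 16, -8, &hit))
        printf("reused rd=%lld\n", (long long)hit.rd_cost);
      return 0;
    }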
- int i, j; - int br = 0, bd = 0; + int i, j, br = 0, idx, idy; + int64_t bd = 0, block_sse = 0; MB_PREDICTION_MODE this_mode; - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; + MODE_INFO *mi = x->e_mbd.mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; const int label_count = 4; - int64_t this_segment_rd = 0, other_segment_rd; + int64_t this_segment_rd = 0; int label_mv_thresh; - int rate = 0; - int sbr = 0, sbd = 0; int segmentyrate = 0; - int best_eobs[4] = { 0 }; BLOCK_SIZE_TYPE bsize = mbmi->sb_type; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int idx, idy; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; vp9_variance_fn_ptr_t *v_fn_ptr; - ENTROPY_CONTEXT t_above[4], t_left[4]; - ENTROPY_CONTEXT t_above_b[4], t_left_b[4]; + ENTROPY_CONTEXT t_above[2], t_left[2]; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + int mode_idx; + int subpelmv = 1, have_ref = 0; vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); - v_fn_ptr = &cpi->fn_ptr[get_block_size(4 << bwl, 4 << bhl)]; + v_fn_ptr = &cpi->fn_ptr[bsize]; // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on @@ -1312,17 +1856,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, label_mv_thresh = 1 * bsi->mvthresh / label_count; // Segmentation method overheads - other_segment_rd = this_segment_rd; - - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { // TODO(jingning,rbultje): rewrite the rate-distortion optimization // loop for 4x4/4x8/8x4 block coding. 
to be replaced with new rd loop int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT]; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX; MB_PREDICTION_MODE mode_selected = ZEROMV; - int bestlabelyrate = 0; + int64_t best_rd = INT64_MAX; i = idy * 2 + idx; frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; @@ -1339,20 +1880,58 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { - int64_t this_rd; - int distortion; - int labelyrate; - ENTROPY_CONTEXT t_above_s[4], t_left_s[4]; const struct buf_2d orig_src = x->plane[0].src; struct buf_2d orig_pre[2]; - vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + mode_idx = inter_mode_offset(this_mode); + bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + + // if we're near/nearest and mv == 0,0, compare to zeromv + if ((this_mode == NEARMV || this_mode == NEARESTMV || + this_mode == ZEROMV) && + frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 && + (mbmi->ref_frame[1] <= 0 || + frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) { + int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + + if (this_mode == NEARMV) { + if (c1 > c3) + continue; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) + continue; + } else { + assert(this_mode == ZEROMV); + if (mbmi->ref_frame[1] <= 0) { + if ((c3 >= c2 && + frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || + (c3 >= c1 && + frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) + continue; + } else { + if ((c3 >= c2 && + frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && + frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || + (c3 >= c1 && + frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && + frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) + continue; + } + } + } - vpx_memcpy(t_above_s, t_above, sizeof(t_above_s)); - vpx_memcpy(t_left_s, t_left, sizeof(t_left_s)); + vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre)); + vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above, + sizeof(bsi->rdstat[i][mode_idx].ta)); + vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left, + sizeof(bsi->rdstat[i][mode_idx].tl)); // motion search for newmv (single predictor case only) - if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV) { + if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV && + seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { int step_param = 0; int further_steps; int thissme, bestsme = INT_MAX; @@ -1361,7 +1940,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, /* Is the best so far sufficiently good that we cant justify doing * and new motion search. */ - if (best_label_rd < label_mv_thresh) + if (best_rd < label_mv_thresh) break; if (cpi->compressor_speed) { @@ -1372,9 +1951,24 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (i == 2) bsi->mvp.as_int = x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int; - step_param = 2; } } + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and the best ref mvs of the current block for + // the given reference. 
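The auto_mv_step_size branch continuing just below picks the diamond search's starting step as the average of two estimates: one derived from the previous frame's maximum MV magnitude (or the block's best reference MVs) and the encoder's baseline mv_step_param. The sketch below mirrors that blend; init_search_range() here is a hypothetical mapping (larger expected motion gives a smaller step_param, i.e. a longer first step), not the library's vp9_init_search_range().

    #include <stdio.h>

    #define MAX_MVSEARCH_STEPS 11

    /* Hypothetical mapping from an expected MV magnitude (full pels) to a
     * starting step index: small expected motion => large step_param =>
     * short first steps and a restricted search. */
    static int init_search_range(int expected_mv_magnitude) {
      int sr = 0;
      while ((1 << sr) < expected_mv_magnitude && sr < MAX_MVSEARCH_STEPS - 2)
        ++sr;
      return (MAX_MVSEARCH_STEPS - 2) - sr;
    }

    /* Blend the motion-based estimate with the encoder baseline, as the patch
     * does with cpi->mv_step_param. */
    static int pick_step_param(int max_mv_last_frame, int baseline_step) {
      return (init_search_range(max_mv_last_frame) + baseline_step) >> 1;
    }

    int main(void) {
      printf("quiet frame: step_param=%d\n", pick_step_param(2, 6));
      printf("busy frame:  step_param=%d\n", pick_step_param(128, 6));
      return 0;
    }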
+ if (i == 0) + step_param = (vp9_init_search_range( + cpi, x->max_mv_context[mbmi->ref_frame[0]]) + + cpi->mv_step_param) >> 1; + else + step_param = (vp9_init_search_range( + cpi, MAX(abs(bsi->mvp.as_mv.row), + abs(bsi->mvp.as_mv.col)) >> 3) + + cpi->mv_step_param) >> 1; + } else { + step_param = cpi->mv_step_param; + } further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -1424,14 +2018,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // restore src pointers mi_buf_restore(x, orig_src, orig_pre); - } else if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV) { + } + + if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV && + mbmi->interp_filter == vp9_switchable_interp[0]) { if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV || seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) continue; // adjust src pointers mi_buf_shift(x, i); - if (cpi->sf.comp_inter_joint_search_thresh < bsize) { + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, mi_col, seg_mvs[i], @@ -1445,146 +2042,209 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mi_buf_restore(x, orig_src, orig_pre); } - rate = labels2mode(x, i, this_mode, &mode_mv[this_mode], - &second_mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, - x->mvcost, cpi); + bsi->rdstat[i][mode_idx].brate = + labels2mode(x, i, this_mode, &mode_mv[this_mode], + &second_mode_mv[this_mode], frame_mv, seg_mvs[i], + bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, + x->mvcost, cpi); + + bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].mvs[0].as_int = + mode_mv[this_mode].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].mvs[0].as_int = + mode_mv[this_mode].as_int; + if (mbmi->ref_frame[1] > 0) { + bsi->rdstat[i][mode_idx].mvs[1].as_int = + second_mode_mv[this_mode].as_int; + if (num_4x4_blocks_wide > 1) + bsi->rdstat[i + 1][mode_idx].mvs[1].as_int = + second_mode_mv[this_mode].as_int; + if (num_4x4_blocks_high > 1) + bsi->rdstat[i + 2][mode_idx].mvs[1].as_int = + second_mode_mv[this_mode].as_int; + } // Trap vectors that reach beyond the UMV borders - if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || - ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || - ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || - ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) { + if (mv_check_bounds(x, &mode_mv[this_mode])) continue; - } if (mbmi->ref_frame[1] > 0 && mv_check_bounds(x, &second_mode_mv[this_mode])) continue; - this_rd = encode_inter_mb_segment(&cpi->common, - x, i, &labelyrate, - &distortion, t_above_s, t_left_s); - this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0); - rate += labelyrate; + if (filter_idx > 0) { + BEST_SEG_INFO *ref_bsi = bsi_buf; + subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) || + (mode_mv[this_mode].as_mv.col & 0x0f); + have_ref = mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; + if (mbmi->ref_frame[1] > 0) { + subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) || + (second_mode_mv[this_mode].as_mv.col & 0x0f); + have_ref &= second_mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; + } + + if (filter_idx > 1 && !subpelmv && !have_ref) { + ref_bsi = bsi_buf + 1; + have_ref = mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; + if (mbmi->ref_frame[1] > 0) { + have_ref &= 
second_mode_mv[this_mode].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; + } + } + + if (!subpelmv && have_ref && + ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { + vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], + sizeof(SEG_RDSTAT)); + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + mode_selected = this_mode; + best_rd = bsi->rdstat[i][mode_idx].brdcost; + } + continue; + } + } + + bsi->rdstat[i][mode_idx].brdcost = + encode_inter_mb_segment(cpi, x, + bsi->segment_rd - this_segment_rd, i, + &bsi->rdstat[i][mode_idx].byrate, + &bsi->rdstat[i][mode_idx].bdist, + &bsi->rdstat[i][mode_idx].bsse, + bsi->rdstat[i][mode_idx].ta, + bsi->rdstat[i][mode_idx].tl); + if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv, + bsi->rdstat[i][mode_idx].brate, 0); + bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; + bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i]; + } - if (this_rd < best_label_rd) { - sbr = rate; - sbd = distortion; - bestlabelyrate = labelyrate; + if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_label_rd = this_rd; - best_eobs[i] = x->e_mbd.plane[0].eobs[i]; - vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s)); - vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s)); + best_rd = bsi->rdstat[i][mode_idx].brdcost; } } /*for each 4x4 mode*/ - vpx_memcpy(t_above, t_above_b, sizeof(t_above)); - vpx_memcpy(t_left, t_left_b, sizeof(t_left)); + if (best_rd == INT64_MAX) { + int iy, midx; + for (iy = i + 1; iy < 4; ++iy) + for (midx = 0; midx < VP9_INTER_MODES; ++midx) + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; + return; + } + + mode_idx = inter_mode_offset(mode_selected); + vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); + vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); labels2mode(x, i, mode_selected, &mode_mv[mode_selected], &second_mode_mv[mode_selected], frame_mv, seg_mvs[i], bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, x->mvcost, cpi); - br += sbr; - bd += sbd; - segmentyrate += bestlabelyrate; - this_segment_rd += best_label_rd; - other_segment_rd += best_other_rd; + br += bsi->rdstat[i][mode_idx].brate; + bd += bsi->rdstat[i][mode_idx].bdist; + block_sse += bsi->rdstat[i][mode_idx].bsse; + segmentyrate += bsi->rdstat[i][mode_idx].byrate; + this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + + if (this_segment_rd > bsi->segment_rd) { + int iy, midx; + for (iy = i + 1; iy < 4; ++iy) + for (midx = 0; midx < VP9_INTER_MODES; ++midx) + bsi->rdstat[iy][midx].brdcost = INT64_MAX; + bsi->segment_rd = INT64_MAX; + return; + } - for (j = 1; j < bh; ++j) + for (j = 1; j < num_4x4_blocks_high; ++j) vpx_memcpy(&x->partition_info->bmi[i + j * 2], &x->partition_info->bmi[i], sizeof(x->partition_info->bmi[i])); - for (j = 1; j < bw; ++j) + for (j = 1; j < num_4x4_blocks_wide; ++j) vpx_memcpy(&x->partition_info->bmi[i + j], &x->partition_info->bmi[i], sizeof(x->partition_info->bmi[i])); } } /* for each label */ - if (this_segment_rd < bsi->segment_rd) { - bsi->r = br; - bsi->d = bd; - bsi->segment_yrate = segmentyrate; - bsi->segment_rd = this_segment_rd; + bsi->r = br; + bsi->d = bd; + bsi->segment_yrate = segmentyrate; + bsi->segment_rd = this_segment_rd; + bsi->sse = block_sse; - // store everything needed to come back to this!! 
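The SEG_RDSTAT bookkeeping above replaces the loose best_eobs/t_above_b/t_left_b locals: every (4x4 block, inter mode) candidate records its rate, distortion, SSE, RD cost, motion vectors and the entropy contexts it produced, and the winning mode's contexts are copied back into t_above/t_left before the next block is coded. A compact sketch of that record-then-restore pattern, with one-byte contexts and hypothetical field names standing in for the real structs:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define INTER_MODES 4

    typedef struct {
      int64_t rd_cost;
      int     rate, eobs;
      uint8_t ctx_above[2], ctx_left[2];  /* contexts left behind by this mode */
    } RdStat;

    int main(void) {
      uint8_t t_above[2] = { 0, 0 }, t_left[2] = { 0, 0 };
      RdStat stat[INTER_MODES];
      int mode, best_mode = 0;
      int64_t best_rd = INT64_MAX;

      for (mode = 0; mode < INTER_MODES; ++mode) {
        /* Each trial starts from the current contexts... */
        memcpy(stat[mode].ctx_above, t_above, sizeof(t_above));
        memcpy(stat[mode].ctx_left, t_left, sizeof(t_left));
        /* ...codes the block (stubbed out here) and records its outcome. */
        stat[mode].rate = 100 + 10 * mode;
        stat[mode].eobs = mode;
        stat[mode].ctx_above[0] = (uint8_t)(mode != 0);
        stat[mode].rd_cost = 1000 - 50 * mode;
        if (stat[mode].rd_cost < best_rd) {
          best_rd = stat[mode].rd_cost;
          best_mode = mode;
        }
      }

      /* Restore the contexts produced by the winner before the next block. */
      memcpy(t_above, stat[best_mode].ctx_above, sizeof(t_above));
      memcpy(t_left, stat[best_mode].ctx_left, sizeof(t_left));
      printf("best mode %d, rd %lld, ctx_above[0]=%d\n",
             best_mode, (long long)best_rd, t_above[0]);
      return 0;
    }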
- for (i = 0; i < 4; i++) { - bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv; - if (mbmi->ref_frame[1] > 0) - bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv; - bsi->modes[i] = x->partition_info->bmi[i].mode; - bsi->eobs[i] = best_eobs[i]; - } - } + // update the coding decisions + for (i = 0; i < 4; ++i) + bsi->modes[i] = x->partition_info->bmi[i].mode; } -static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *best_ref_mv, - int_mv *second_best_ref_mv, - int64_t best_rd, - int *returntotrate, - int *returnyrate, - int *returndistortion, - int *skippable, int mvthresh, - int_mv seg_mvs[4][MAX_REF_FRAMES], - int mi_row, int mi_col) { +static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, + int_mv *best_ref_mv, + int_mv *second_best_ref_mv, + int64_t best_rd, + int *returntotrate, + int *returnyrate, + int64_t *returndistortion, + int *skippable, int64_t *psse, + int mvthresh, + int_mv seg_mvs[4][MAX_REF_FRAMES], + BEST_SEG_INFO *bsi_buf, + int filter_idx, + int mi_row, int mi_col) { int i; - BEST_SEG_INFO bsi; - MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mode_info_context; + MB_MODE_INFO *mbmi = &mi->mbmi; + int mode_idx; - vpx_memset(&bsi, 0, sizeof(bsi)); + vpx_memset(bsi, 0, sizeof(*bsi)); - bsi.segment_rd = best_rd; - bsi.ref_mv = best_ref_mv; - bsi.second_ref_mv = second_best_ref_mv; - bsi.mvp.as_int = best_ref_mv->as_int; - bsi.mvthresh = mvthresh; + bsi->segment_rd = best_rd; + bsi->ref_mv = best_ref_mv; + bsi->second_ref_mv = second_best_ref_mv; + bsi->mvp.as_int = best_ref_mv->as_int; + bsi->mvthresh = mvthresh; for (i = 0; i < 4; i++) - bsi.modes[i] = ZEROMV; + bsi->modes[i] = ZEROMV; - rd_check_segment_txsize(cpi, x, &bsi, seg_mvs, mi_row, mi_col); + rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col); + if (bsi->segment_rd > best_rd) + return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { - x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int; + mode_idx = inter_mode_offset(bsi->modes[i]); + mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; if (mbmi->ref_frame[1] > 0) - x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int = - bsi.second_mvs[i].as_int; - x->e_mbd.plane[0].eobs[i] = bsi.eobs[i]; + mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; + xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs; + x->partition_info->bmi[i].mode = bsi->modes[i]; } - /* save partitions */ - x->partition_info->count = 4; - - for (i = 0; i < x->partition_info->count; i++) { - x->partition_info->bmi[i].mode = bsi.modes[i]; - x->partition_info->bmi[i].mv.as_mv = bsi.mvs[i].as_mv; - if (mbmi->ref_frame[1] > 0) - x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[i].as_mv; - } /* * used to set mbmi->mv.as_int */ - x->partition_info->bmi[3].mv.as_int = bsi.mvs[3].as_int; - if (mbmi->ref_frame[1] > 0) - x->partition_info->bmi[3].second_mv.as_int = bsi.second_mvs[3].as_int; - - *returntotrate = bsi.r; - *returndistortion = bsi.d; - *returnyrate = bsi.segment_yrate; + *returntotrate = bsi->r; + *returndistortion = bsi->d; + *returnyrate = bsi->segment_yrate; *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_SB8X8); - mbmi->mode = bsi.modes[3]; + *psse = bsi->sse; + mbmi->mode = bsi->modes[3]; - return (int)(bsi.segment_rd); + return bsi->segment_rd; } static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t 
*ref_y_buffer, int ref_y_stride, - int ref_frame, enum BlockSize block_size ) { + int ref_frame, BLOCK_SIZE_TYPE block_size ) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int_mv this_mv; @@ -1593,6 +2253,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, int best_index = 0; int best_sad = INT_MAX; int this_sad = INT_MAX; + unsigned int max_mv = 0; uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; @@ -1602,6 +2263,8 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) { this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int; + max_mv = MAX(max_mv, + MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); // The list is at an end if we see 0 for a second time. if (!this_mv.as_int && zero_seen) break; @@ -1625,6 +2288,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, // Note the index of the mv that worked best in the reference list. x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; } static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, @@ -1633,18 +2297,18 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, vp9_prob *comp_mode_p) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - int seg_ref_active = vp9_segfeature_active(xd, segment_id, + int seg_ref_active = vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single)); vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp)); *comp_mode_p = 128; } else { - vp9_prob intra_inter_p = vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER); + vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd); vp9_prob comp_inter_p = 128; if (cm->comp_pred_mode == HYBRID_PREDICTION) { - comp_inter_p = vp9_get_pred_prob(cm, xd, PRED_COMP_INTER_INTER); + comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd); *comp_mode_p = comp_inter_p; } else { *comp_mode_p = 128; @@ -1653,8 +2317,8 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0); if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { - vp9_prob ref_single_p1 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P1); - vp9_prob ref_single_p2 = vp9_get_pred_prob(cm, xd, PRED_SINGLE_REF_P2); + vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd); + vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd); unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); if (cm->comp_pred_mode == HYBRID_PREDICTION) @@ -1673,7 +2337,7 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, ref_costs_single[ALTREF_FRAME] = 512; } if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) { - vp9_prob ref_comp_p = vp9_get_pred_prob(cm, xd, PRED_COMP_REF_P); + vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd); unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); if (cm->comp_pred_mode == HYBRID_PREDICTION) @@ -1689,12 +2353,13 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, } static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, - int mode_index, - PARTITION_INFO *partition, - int_mv *ref_mv, - int_mv *second_ref_mv, - int64_t comp_pred_diff[NB_PREDICTION_TYPES], - int64_t txfm_size_diff[NB_TXFM_MODES]) { + int mode_index, + PARTITION_INFO *partition, + int_mv *ref_mv, + int_mv *second_ref_mv, + int64_t comp_pred_diff[NB_PREDICTION_TYPES], + int64_t 
txfm_size_diff[NB_TXFM_MODES], + int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be @@ -1713,7 +2378,11 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; + // FIXME(rbultje) does this memcpy the whole array? I believe sizeof() + // doesn't actually work this way memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff)); + memcpy(ctx->best_filter_diff, best_filter_diff, + sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1)); } static void setup_pred_block(const MACROBLOCKD *xd, @@ -1744,7 +2413,7 @@ static void setup_pred_block(const MACROBLOCKD *xd, static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int idx, MV_REFERENCE_FRAME frame_type, - enum BlockSize block_size, + BLOCK_SIZE_TYPE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], @@ -1786,8 +2455,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. - if (scale[frame_type].x_scale_fp == (1 << VP9_REF_SCALE_SHIFT) && - scale[frame_type].y_scale_fp == (1 << VP9_REF_SCALE_SHIFT)) + if (scale[frame_type].x_scale_fp == VP9_REF_NO_SCALE && + scale[frame_type].y_scale_fp == VP9_REF_NO_SCALE) mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, frame_type, block_size); } @@ -1800,93 +2469,11 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { return scaled_ref_frame; } -static void model_rd_from_var_lapndz(int var, int n, int qstep, - int *rate, int *dist) { - // This function models the rate and distortion for a Laplacian - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expressions are in: - // Hang and Chen, "Source Model for transform video coder and its - // application - Part I: Fundamental Theory", IEEE Trans. Circ. - // Sys. for Video Tech., April 1997. - // The function is implemented as piecewise approximation to the - // exact computation. 
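The model removed below (model_rd_from_var_lapndz and its per-plane wrapper model_rd_for_sb) estimates rate and distortion from the variance of the prediction error and the effective quantizer step (dequant[1] >> 3, since the transform coefficients are at 8x scale); later hunks still call model_rd_for_sb, so the definition is presumably relocated rather than dropped. Those modeled numbers only rank candidates cheaply, for example the new use_rd_breakout path gives up when the modeled RD cost is already more than twice the best so far. The sketch below shows that consumption pattern only: the model and variance values are stubs, and the rd_cost() weighting is an assumed RDCOST-like form (rate scaled by rdmult/256, distortion shifted by rddiv), not a quote of the real macro.

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed RDCOST-style weighting; the encoder's actual macro may differ. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

    /* Stub for the Laplacian-based model: a rate/distortion guess for one
     * plane from its prediction-error variance and quantizer step.
     * Illustrative only: a finer quantizer costs more rate, less distortion. */
    static void model_plane_rd(unsigned int var, int qstep,
                               int *rate, int64_t *dist) {
      *rate = (int)(var / (unsigned)(qstep + 1));
      *dist = (int64_t)var - *rate;
      if (*dist < 0) *dist = 0;
    }

    int main(void) {
      /* Per-plane prediction-error variances and dequant values (made up). */
      const unsigned int var[3] = { 4000, 900, 850 };
      const int dequant1[3] = { 160, 144, 144 };
      const int rdmult = 300, rddiv = 2;
      const int64_t ref_best_rd = 2000;

      int i, rate_sum = 0;
      int64_t dist_sum = 0, rd;
      for (i = 0; i < 3; ++i) {
        int rate;
        int64_t dist;
        model_plane_rd(var[i], dequant1[i] >> 3, &rate, &dist);
        rate_sum += rate;
        dist_sum += dist;
      }
      rd = rd_cost(rdmult, rddiv, rate_sum, dist_sum);

      /* rd-breakout: if the modeled cost is more than twice the best so far,
       * skip the full rate-distortion evaluation for this candidate. */
      if (rd / 2 > ref_best_rd)
        printf("modeled rd=%lld -> skip full RD\n", (long long)rd);
      else
        printf("modeled rd=%lld -> do full RD\n", (long long)rd);
      return 0;
    }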
- // TODO(debargha): Implement the functions by interpolating from a - // look-up table - vp9_clear_system_state(); - if (var == 0 || n == 0) { - *rate = 0; - *dist = 0; - } else { - double D, R; - double s2 = (double) var / n; - double s = sqrt(s2); - double x = qstep / s; - if (x > 1.0) { - double y = exp(-x / 2); - double y2 = y * y; - D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275; - R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017; - } else { - double x2 = x * x; - D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807; - if (x > 0.125) - R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x + - 0.1626989668625); - else - R = -1.442252874826093 * log(x) + 1.944647760719664; - } - if (R < 0) { - *rate = 0; - *dist = var; - } else { - *rate = (n * R * 256 + 0.5); - *dist = (n * D * s2 + 0.5); - } - } - vp9_clear_system_state(); -} - -static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize, - struct macroblockd_plane *pd) { - return get_block_size(plane_block_width(bsize, pd), - plane_block_height(bsize, pd)); -} - -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int *out_dist_sum) { - // Note our transform coeffs are 8 times an orthogonal transform. - // Hence quantizer step is also 8 times. To get effective quantizer - // we need to divide by 8 before sending to modeling function. - unsigned int sse, var; - int i, rate_sum = 0, dist_sum = 0; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - - // TODO(dkovalev) the same code in get_plane_block_size - const int bw = plane_block_width(bsize, pd); - const int bh = plane_block_height(bsize, pd); - const enum BlockSize bs = get_block_size(bw, bh); - int rate, dist; - var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist); - - rate_sum += rate; - dist_sum += dist; - } - - *out_rate_sum = rate_sum; - *out_dist_sum = dist_sum; -} - static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + const int c = vp9_get_pred_context_switchable_interp(xd); const int m = vp9_switchable_interp_map[mbmi->interp_filter]; return SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m]; } @@ -1896,16 +2483,16 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; + VP9_COMMON *cm = &cpi->common; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; int bestsme = INT_MAX; - int further_steps, step_param = cpi->sf.first_step; + int further_steps, step_param; int sadpb = x->sadperbit16; int_mv mvp_full; int ref = mbmi->ref_frame[0]; int_mv ref_mv = mbmi->ref_mvs[ref][0]; - int sr = 0; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; @@ -1922,24 +2509,48 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, 
scaled_ref_frame, NULL, mi_row, mi_col, - NULL, NULL); + setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } vp9_clamp_mv_min_max(x, &ref_mv); - sr = vp9_init_search_range(cpi->common.width, cpi->common.height); - - // mvp_full.as_int = ref_mv[0].as_int; - mvp_full.as_int = - mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; + // Adjust search parameters based on small partitions' result. + if (x->fast_ms) { + // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 && + // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) { + // adjust search range + step_param = 6; + if (x->fast_ms > 1) + step_param = 8; + + // Get prediction MV. + mvp_full.as_int = x->pred_mv.as_int; + + // Adjust MV sign if needed. + if (cm->ref_frame_sign_bias[ref]) { + mvp_full.as_mv.col *= -1; + mvp_full.as_mv.row *= -1; + } + } else { + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + + cpi->mv_step_param) >> 1; + } else { + step_param = cpi->mv_step_param; + } + // mvp_full.as_int = ref_mv[0].as_int; + mvp_full.as_int = + mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; + } mvp_full.as_mv.col >>= 3; mvp_full.as_mv.row >>= 3; - // adjust search range according to sr from mv prediction - step_param = MAX(step_param, sr); - // Further step/diamond searches as necessary further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; @@ -1984,7 +2595,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv ref_mv[2]; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); int ite; // Prediction buffer from second frame. uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); @@ -2008,8 +2619,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // motion search code to be used without additional modifications. for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, - NULL, NULL); + setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL); } if (scaled_ref_frame[1]) { @@ -2017,8 +2627,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_second_yv12[i] = xd->plane[i].pre[1]; - setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col, - NULL, NULL); + setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL); } xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], @@ -2057,7 +2666,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, &frame_mv[refs[!id]], &xd->scale_factor[!id], pw, ph, 0, - &xd->subpix); + &xd->subpix, MV_PRECISION_Q3); // Compound motion search on first ref frame. 
if (id) @@ -2134,35 +2743,37 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int64_t txfm_cache[], - int *rate2, int *distortion, int *skippable, - int *rate_y, int *distortion_y, - int *rate_uv, int *distortion_uv, + int *rate2, int64_t *distortion, + int *skippable, + int *rate_y, int64_t *distortion_y, + int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, INTERPOLATIONFILTERTYPE *best_filter, - int_mv *frame_mv, + int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, - int_mv single_newmv[MAX_REF_FRAMES]) { - const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); - + int_mv single_newmv[MAX_REF_FRAMES], + int64_t *psse, int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); - const enum BlockSize uv_block_size = get_plane_block_size(bsize, - &xd->plane[1]); MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; const int is_comp_pred = (mbmi->ref_frame[1] > 0); const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; + int_mv *frame_mv = mode_mv[this_mode]; int i; int refs[2] = { mbmi->ref_frame[0], (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; int_mv cur_mv[2]; int64_t this_rd = 0; - unsigned char tmp_buf[MAX_MB_PLANE][64 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); int pred_exists = 0; int interpolating_intpel_seen = 0; int intpel_mv; int64_t rd, best_rd = INT64_MAX; + int best_needs_copy = 0; + uint8_t *orig_dst[MAX_MB_PLANE]; + int orig_dst_stride[MAX_MB_PLANE]; + int rs = 0; switch (this_mode) { int rate_mv; @@ -2172,7 +2783,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh < bsize) { + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); } else { @@ -2189,7 +2800,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; *rate2 += rate_mv; - } else { int_mv tmp_mv; single_motion_search(cpi, x, bsize, mi_row, mi_col, @@ -2206,6 +2816,43 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, default: break; } + + // if we're near/nearest and mv == 0,0, compare to zeromv + if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && + frame_mv[refs[0]].as_int == 0 && + !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP) && + (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { + int rfc = mbmi->mb_mode_context[mbmi->ref_frame[0]]; + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + + if (this_mode == NEARMV) { + if (c1 > c3) + return INT64_MAX; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) + return INT64_MAX; + } else { + assert(this_mode == ZEROMV); + if (num_refs == 1) { + if ((c3 >= c2 && + mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || + (c3 >= c1 && + mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) + return INT64_MAX; + } else { + if ((c3 >= c2 && + mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && + mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || + (c3 >= c1 && + 
mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && + mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) + return INT64_MAX; + } + } + } + for (i = 0; i < num_refs; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image @@ -2219,12 +2866,30 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[i].as_int = cur_mv[i].as_int; } + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. + for (i = 0; i < MAX_MB_PLANE; i++) { + orig_dst[i] = xd->plane[i].dst.buf; + orig_dst_stride[i] = xd->plane[i].dst.stride; + } + /* We don't include the cost of the second reference here, because there * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other * words if you present them in that order, the second one is always known * if the first is known */ - *rate2 += vp9_cost_mv_ref(cpi, this_mode, - mbmi->mb_mode_context[mbmi->ref_frame[0]]); + *rate2 += cost_mv_ref(cpi, this_mode, + mbmi->mb_mode_context[mbmi->ref_frame[0]]); + + if (!(*mode_excluded)) { + if (is_comp_pred) { + *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); + } else { + *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); + } + } pred_exists = 0; interpolating_intpel_seen = 0; @@ -2236,78 +2901,113 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (mbmi->mv[1].as_mv.col & 15) == 0; // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - if (cpi->speed > 4) { + *best_filter = EIGHTTAP; + if (cpi->sf.use_8tap_always) { *best_filter = EIGHTTAP; + vp9_zero(cpi->rd_filter_cache); } else { int i, newbest; - int tmp_rate_sum = 0, tmp_dist_sum = 0; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; + + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { - int rs = 0; + int j; + int64_t rs_rd; const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i]; - const int is_intpel_interp = intpel_mv && - vp9_is_interpolating_filter[filter]; + const int is_intpel_interp = intpel_mv; mbmi->interp_filter = filter; vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - - if (cm->mcomp_filter_type == SWITCHABLE) - rs = get_switchable_rate(cm, x); + rs = get_switchable_rate(cm, x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); if (interpolating_intpel_seen && is_intpel_interp) { - rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum); + cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, + tmp_rate_sum, tmp_dist_sum); + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; } else { - int rate_sum = 0, dist_sum = 0; + int rate_sum = 0; + int64_t dist_sum = 0; + if ((cm->mcomp_filter_type == SWITCHABLE && + (!i || best_needs_copy)) || + (cm->mcomp_filter_type != SWITCHABLE && + (cm->mcomp_filter_type == mbmi->interp_filter || + (!interpolating_intpel_seen && is_intpel_interp)))) { + for (j = 0; j < MAX_MB_PLANE; j++) { + xd->plane[j].dst.buf = orig_dst[j]; + xd->plane[j].dst.stride = orig_dst_stride[j]; + } + } else { + for (j = 0; j < MAX_MB_PLANE; j++) { + 
xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; + xd->plane[j].dst.stride = 64; + } + } vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); - rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum); + cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, + rate_sum, dist_sum); + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], + cpi->rd_filter_cache[i] + rs_rd); + rd = cpi->rd_filter_cache[i]; + if (cm->mcomp_filter_type == SWITCHABLE) + rd += rs_rd; if (!interpolating_intpel_seen && is_intpel_interp) { tmp_rate_sum = rate_sum; tmp_dist_sum = dist_sum; } } + if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + if (rd / 2 > ref_best_rd) { + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } + } newbest = i == 0 || rd < best_rd; if (newbest) { best_rd = rd; *best_filter = mbmi->interp_filter; + if (cm->mcomp_filter_type == SWITCHABLE && i && + !(interpolating_intpel_seen && is_intpel_interp)) + best_needs_copy = !best_needs_copy; } if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || (cm->mcomp_filter_type != SWITCHABLE && cm->mcomp_filter_type == mbmi->interp_filter)) { - int p; - - for (p = 0; p < MAX_MB_PLANE; p++) { - const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y; - const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x; - int i; - - for (i = 0; i < y; i++) - vpx_memcpy(&tmp_buf[p][64 * i], - xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, x); - } pred_exists = 1; } interpolating_intpel_seen |= is_intpel_interp; } - } + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + } // Set the appripriate filter mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ? cm->mcomp_filter_type : *best_filter; vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - + rs = (cm->mcomp_filter_type == SWITCHABLE ? 
get_switchable_rate(cm, x) : 0); if (pred_exists) { - int p; - - for (p = 0; p < MAX_MB_PLANE; p++) { - const int y = (MI_SIZE * bh) >> xd->plane[p].subsampling_y; - const int x = (MI_SIZE * bw) >> xd->plane[p].subsampling_x; - int i; - - for (i = 0; i < y; i++) - vpx_memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, - &tmp_buf[p][64 * i], x); + if (best_needs_copy) { + // again temporarily set the buffers to local memory to prevent a memcpy + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = tmp_buf + i * 64 * 64; + xd->plane[i].dst.stride = 64; + } } } else { // Handles the special case when a filter that is not in the @@ -2315,42 +3015,60 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); } + + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { + int tmp_rate; + int64_t tmp_dist; + model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist); + rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + // if current pred_error modeled rd is substantially more than the best + // so far, do not bother doing full rd + if (rd / 2 > ref_best_rd) { + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } + } + if (cpi->common.mcomp_filter_type == SWITCHABLE) *rate2 += get_switchable_rate(cm, x); if (cpi->active_map_enabled && x->active_ptr[0] == 0) x->skip = 1; else if (x->encode_breakout) { + const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var, sse; - int threshold = (xd->plane[0].dequant[1] - * xd->plane[0].dequant[1] >> 4); + int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4); + if (threshold < x->encode_breakout) threshold = x->encode_breakout; - var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf, - x->plane[0].src.stride, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride, - &sse); + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride, + &sse); if ((int)sse < threshold) { unsigned int q2dc = xd->plane[0].dequant[0]; - /* If there is no codeable 2nd order dc - or a very small uniform pixel change change */ + // If there is no codeable 2nd order dc + // or a very small uniform pixel change change if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) { // Check u and v to make sure skip is ok int sse2; unsigned int sse2u, sse2v; - var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse2u); - var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf, - x->plane[1].src.stride, - xd->plane[2].dst.buf, - xd->plane[1].dst.stride, &sse2v); + var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse2u); + var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse2v); sse2 = sse2u + sse2v; if (sse2 * 2 < threshold) { @@ -2358,7 +3076,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *distortion = sse + sse2; *rate2 = 500; - /* for best_yrd calculation */ + // for best yrd calculation *rate_uv = 0; *distortion_uv = sse2; @@ -2371,89 +3089,91 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!x->skip) { int 
skippable_y, skippable_uv; + int64_t sseuv = INT_MAX; // Y cost and distortion - super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, - bsize, txfm_cache); + super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, + bsize, txfm_cache, ref_best_rd); + + if (*rate_y == INT_MAX) { + *rate2 = INT_MAX; + *distortion = INT64_MAX; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } + return INT64_MAX; + } *rate2 += *rate_y; *distortion += *distortion_y; super_block_uvrd(cm, x, rate_uv, distortion_uv, - &skippable_uv, bsize); + &skippable_uv, &sseuv, bsize); + *psse += sseuv; *rate2 += *rate_uv; *distortion += *distortion_uv; *skippable = skippable_y && skippable_uv; } - if (!(*mode_excluded)) { - if (is_comp_pred) { - *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); - } else { - *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); - } + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; } return this_rd; // if 0, this will be re-calculated by caller } void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *returnrate, int *returndist, + int *returnrate, int64_t *returndist, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - int rate_y = 0, rate_uv; - int rate_y_tokenonly = 0, rate_uv_tokenonly; - int dist_y = 0, dist_uv; + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip; - int64_t txfm_cache[NB_TXFM_MODES], err; - MB_PREDICTION_MODE mode; - TX_SIZE txfm_size; - int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y; - int64_t err4x4 = INT64_MAX; - int i; + int64_t dist_y = 0, dist_uv = 0, txfm_cache[NB_TXFM_MODES]; - vpx_memset(&txfm_cache,0,sizeof(txfm_cache)); + x->skip_encode = 0; + vpx_memset(&txfm_cache, 0, sizeof(txfm_cache)); ctx->skip = 0; - xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; - err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, bsize, txfm_cache); - mode = xd->mode_info_context->mbmi.mode; - txfm_size = xd->mode_info_context->mbmi.txfm_size; - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip, - (bsize < BLOCK_SIZE_SB8X8) ? 
BLOCK_SIZE_SB8X8 : - bsize); - if (bsize < BLOCK_SIZE_SB8X8) - err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y, - &rate4x4_y_tokenonly, - &dist4x4_y, err); + if (bsize >= BLOCK_SIZE_SB8X8) { + if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, &y_skip, bsize, txfm_cache, + best_rd) >= best_rd) { + *returnrate = INT_MAX; + return; + } + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip, bsize); + } else { + y_skip = 0; + if (rd_pick_intra4x4mby_modes(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, best_rd) >= best_rd) { + *returnrate = INT_MAX; + return; + } + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip, BLOCK_SIZE_SB8X8); + } if (y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1); + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); *returndist = dist_y + (dist_uv >> 2); memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); - xd->mode_info_context->mbmi.mode = mode; - xd->mode_info_context->mbmi.txfm_size = txfm_size; - } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) { - *returnrate = rate4x4_y + rate_uv + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); - *returndist = dist4x4_y + (dist_uv >> 2); - vpx_memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); - xd->mode_info_context->mbmi.txfm_size = TX_4X4; } else { + int i; *returnrate = rate_y + rate_uv + - vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0); + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); *returndist = dist_y + (dist_uv >> 2); - for (i = 0; i < NB_TXFM_MODES; i++) { - ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->txfm_mode]; + if (cpi->sf.tx_size_search_method == USE_FULL_RD) { + for (i = 0; i < NB_TXFM_MODES; i++) { + ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->tx_mode]; + } } - xd->mode_info_context->mbmi.txfm_size = txfm_size; - xd->mode_info_context->mbmi.mode = mode; } ctx->mic = *xd->mode_info_context; @@ -2462,15 +3182,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, - int *returndistortion, + int64_t *returndistortion, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx) { + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const enum BlockSize block_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); MB_PREDICTION_MODE this_mode; - MB_PREDICTION_MODE best_mode = DC_PRED; MV_REFERENCE_FRAME ref_frame; unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; int comp_pred, i; @@ -2483,21 +3203,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx}; - int64_t best_rd = INT64_MAX; + int64_t best_rd = best_rd_so_far; + int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise int64_t best_txfm_rd[NB_TXFM_MODES]; int64_t best_txfm_diff[NB_TXFM_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; + int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1]; + int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; MB_MODE_INFO best_mbmode; int j; int mode_index, best_mode_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], 
ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; - int64_t best_overall_rd = INT64_MAX; - INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; + int64_t best_intra_rd = INT64_MAX; + int64_t best_inter_rd = INT64_MAX; + MB_PREDICTION_MODE best_intra_mode = DC_PRED; + // MB_PREDICTION_MODE best_inter_mode = ZEROMV; + MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB]; - int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB]; + int64_t dist_uv[TX_SIZE_MAX_SB]; + int skip_uv[TX_SIZE_MAX_SB]; MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB]; struct scale_factors scale_factor[4]; unsigned int ref_frame_mask = 0; @@ -2513,10 +3240,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int bws = (1 << bwsl) / 4; // mode_info step for subsize int bhsl = b_height_log2(bsize); int bhs = (1 << bhsl) / 4; // mode_info step for subsize + int best_skip2 = 0; + + x->skip_encode = (cpi->sf.skip_encode_frame && + xd->q_index < QIDX_SKIP_THRESH); for (i = 0; i < 4; i++) { int j; - for (j = 0; j < MAX_REF_FRAMES; j++) seg_mvs[i][j].as_int = INVALID_MV; } @@ -2534,9 +3264,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[i] = INT64_MAX; for (i = 0; i < NB_TXFM_MODES; i++) best_txfm_rd[i] = INT64_MAX; + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + best_filter_rd[i] = INT64_MAX; + for (i = 0; i < TX_SIZE_MAX_SB; i++) + rate_uv_intra[i] = INT_MAX; + + *returnrate = INT_MAX; // Create a mask set to 1 for each frame used by a smaller resolution. - if (cpi->speed > 0) { + if (cpi->sf.use_avoid_tested_higherror) { switch (block_size) { case BLOCK_64X64: for (i = 0; i < 4; i++) { @@ -2576,22 +3312,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; } - if (cpi->speed == 0 - || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) { - mbmi->mode = DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 : - (bsize < BLOCK_SIZE_SB32X32 ? TX_8X8 : - (bsize < BLOCK_SIZE_SB64X64 ? TX_16X16 : TX_32X32))); - i++) { - mbmi->txfm_size = i; - rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i], - &dist_uv[i], &skip_uv[i], - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : - bsize); - mode_uv[i] = mbmi->uv_mode; - } - } for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int mode_excluded = 0; @@ -2599,14 +3319,30 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int disable_skip = 0; int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; - int distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable; int64_t txfm_cache[NB_TXFM_MODES]; int i; + int this_skip2 = 0; + int64_t total_sse = INT_MAX; + int early_term = 0; for (i = 0; i < NB_TXFM_MODES; ++i) txfm_cache[i] = INT64_MAX; + this_mode = vp9_mode_order[mode_index].mode; + ref_frame = vp9_mode_order[mode_index].ref_frame; + + // Slip modes that have been masked off but always consider first mode. 
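/* Illustrative sketch (not from the libvpx sources; helper name and
 * parameters are hypothetical): how the RDCOST macro from vp9_rdopt.h
 * combines a rate term and a distortion term, and how the new
 * sf.use_rd_breakout test in this commit abandons a candidate mode when
 * even half of a cheap model-based rd estimate already exceeds the best
 * rd found so far (mirrors `if (rd / 2 > ref_best_rd) return INT64_MAX;`). */
#include <stdint.h>

#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))

/* Returns 1 when the search for this mode can be abandoned early. */
static int rd_breakout_hit(int rdmult, int rddiv,
                           int model_rate, int64_t model_dist,
                           int64_t ref_best_rd) {
  const int64_t rd = RDCOST(rdmult, rddiv, model_rate, model_dist);
  return ref_best_rd < INT64_MAX && rd / 2 > ref_best_rd;
}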
+ if ( mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) && + (cpi->unused_mode_skip_mask & (1 << mode_index)) ) + continue; + + // Skip if the current refernce frame has been masked off + if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && + (cpi->ref_frame_mask & (1 << ref_frame))) + continue; + // Test best rd so far against threshold for trying this mode. if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] * cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 4)) || @@ -2616,14 +3352,18 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Do not allow compound prediction if the segment level reference // frame feature is in use as in this case there can only be one reference. if ((vp9_mode_order[mode_index].second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) + vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) continue; x->skip = 0; - this_mode = vp9_mode_order[mode_index].mode; - ref_frame = vp9_mode_order[mode_index].ref_frame; - if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) { + // Skip some checking based on small partitions' result. + if (x->fast_ms > 1 && !ref_frame) + continue; + if (x->fast_ms > 2 && ref_frame != x->subblock_ref) + continue; + + if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) { if (!(ref_frame_mask & (1 << ref_frame))) { continue; } @@ -2649,27 +3389,32 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; } + comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + if (comp_pred) { + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) + continue; + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) + if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame && + vp9_mode_order[mode_index].second_ref_frame != best_inter_ref_frame) + continue; + } // TODO(jingning, jkoleszar): scaling reference frame not supported for // SPLITMV. if (mbmi->ref_frame[0] > 0 && - (scale_factor[mbmi->ref_frame[0]].x_scale_fp != - (1 << VP9_REF_SCALE_SHIFT) || - scale_factor[mbmi->ref_frame[0]].y_scale_fp != - (1 << VP9_REF_SCALE_SHIFT)) && + (scale_factor[mbmi->ref_frame[0]].x_scale_fp != VP9_REF_NO_SCALE || + scale_factor[mbmi->ref_frame[0]].y_scale_fp != VP9_REF_NO_SCALE) && this_mode == SPLITMV) continue; if (mbmi->ref_frame[1] > 0 && - (scale_factor[mbmi->ref_frame[1]].x_scale_fp != - (1 << VP9_REF_SCALE_SHIFT) || - scale_factor[mbmi->ref_frame[1]].y_scale_fp != - (1 << VP9_REF_SCALE_SHIFT)) && + (scale_factor[mbmi->ref_frame[1]].x_scale_fp != VP9_REF_NO_SCALE || + scale_factor[mbmi->ref_frame[1]].y_scale_fp != VP9_REF_NO_SCALE) && this_mode == SPLITMV) continue; set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); - comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; @@ -2691,9 +3436,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); - mode_excluded = - mode_excluded ? - mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; + mode_excluded = mode_excluded + ? mode_excluded + : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; } else { // mbmi->ref_frame[1] = vp9_mode_order[mode_index].ref_frame[1]; if (ref_frame != INTRA_FRAME) { @@ -2713,23 +3458,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // If the segment reference frame feature is enabled.... 
// then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) != + (int)ref_frame) { continue; // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) && + } else if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP) && (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { + } else if (!vp9_segfeature_active(&xd->seg, segment_id, + SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) { + if ((this_mode != ZEROMV && + !(this_mode == NEARMV && + frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && + !(this_mode == NEARESTMV && + frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || + ref_frame != ALTREF_FRAME) { continue; } } @@ -2747,6 +3500,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_mode == I4X4_PRED) { int rate; + /* + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)) + continue; + */ + mbmi->txfm_size = TX_4X4; rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion_y, INT64_MAX); @@ -2754,8 +3513,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 += distortion_y; + if (rate_uv_intra[TX_4X4] == INT_MAX) { + choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4], + &rate_uv_tokenonly[TX_4X4], + &dist_uv[TX_4X4], &skip_uv[TX_4X4], + &mode_uv[TX_4X4]); + } rate2 += rate_uv_intra[TX_4X4]; - rate_uv = rate_uv_intra[TX_4X4]; + rate_uv = rate_uv_tokenonly[TX_4X4]; distortion2 += dist_uv[TX_4X4]; distortion_uv = dist_uv[TX_4X4]; mbmi->uv_mode = mode_uv[TX_4X4]; @@ -2764,41 +3529,68 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, txfm_cache[i] = txfm_cache[ONLY_4X4]; } else if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, - bsize, txfm_cache); - - uv_tx = mbmi->txfm_size; - if (bsize < BLOCK_SIZE_MB16X16 && uv_tx == TX_8X8) - uv_tx = TX_4X4; - if (bsize < BLOCK_SIZE_SB32X32 && uv_tx == TX_16X16) - uv_tx = TX_8X8; - else if (bsize < BLOCK_SIZE_SB64X64 && uv_tx == TX_32X32) - uv_tx = TX_16X16; - - rate_uv = rate_uv_intra[uv_tx]; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME) + continue; + } + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(mbmi->mode, best_intra_mode)) + 
continue; + } + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + bsize, txfm_cache, best_rd); + + if (rate_y == INT_MAX) + continue; + + uv_tx = MIN(mbmi->txfm_size, max_uv_txsize_lookup[bsize]); + if (rate_uv_intra[uv_tx] == INT_MAX) { + choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx], + &rate_uv_tokenonly[uv_tx], + &dist_uv[uv_tx], &skip_uv[uv_tx], + &mode_uv[uv_tx]); + } + + rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; mbmi->uv_mode = mode_uv[uv_tx]; - rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv; + rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else if (this_mode == SPLITMV) { const int is_comp_pred = mbmi->ref_frame[1] > 0; - int rate, distortion; + int rate; + int64_t distortion; int64_t this_rd_thresh; int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; - int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0; + int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse; + int tmp_best_skippable = 0; int switchable_filter_index; int_mv *second_ref = is_comp_pred ? &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL; union b_mode_info tmp_best_bmodes[16]; MB_MODE_INFO tmp_best_mbmode; PARTITION_INFO tmp_best_partition; + BEST_SEG_INFO bsi[VP9_SWITCHABLE_FILTERS]; int pred_exists = 0; int uv_skippable; + if (is_comp_pred) { + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) + continue; + if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) + if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame && + vp9_mode_order[mode_index].second_ref_frame != + best_inter_ref_frame) + continue; + } this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ? 
cpi->rd_threshes[bsize][THR_NEWMV] : @@ -2807,25 +3599,36 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh; xd->mode_info_context->mbmi.txfm_size = TX_4X4; + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = INT64_MAX; for (switchable_filter_index = 0; switchable_filter_index < VP9_SWITCHABLE_FILTERS; ++switchable_filter_index) { - int newbest; + int newbest, rs; + int64_t rs_rd; mbmi->interp_filter = - vp9_switchable_interp[switchable_filter_index]; + vp9_switchable_interp[switchable_filter_index]; vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - second_ref, INT64_MAX, + second_ref, + best_yrd, &rate, &rate_y, &distortion, - &skippable, + &skippable, &total_sse, (int)this_rd_thresh, seg_mvs, + bsi, switchable_filter_index, mi_row, mi_col); - if (cpi->common.mcomp_filter_type == SWITCHABLE) { - const int rs = get_switchable_rate(cm, x); - tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0); - } + + if (tmp_rd == INT64_MAX) + continue; + cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; + rs = get_switchable_rate(cm, x); + rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); + cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = + MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd); + if (cm->mcomp_filter_type == SWITCHABLE) + tmp_rd += rs_rd; + newbest = (tmp_rd < tmp_best_rd); if (newbest) { tmp_best_filter = mbmi->interp_filter; @@ -2834,19 +3637,34 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || (mbmi->interp_filter == cm->mcomp_filter_type && cm->mcomp_filter_type != SWITCHABLE)) { - tmp_best_rdu = tmp_rd; - tmp_best_rate = rate; - tmp_best_ratey = rate_y; - tmp_best_distortion = distortion; - tmp_best_skippable = skippable; - tmp_best_mbmode = *mbmi; - tmp_best_partition = *x->partition_info; - for (i = 0; i < 4; i++) - tmp_best_bmodes[i] = xd->mode_info_context->bmi[i]; - pred_exists = 1; + tmp_best_rdu = tmp_rd; + tmp_best_rate = rate; + tmp_best_ratey = rate_y; + tmp_best_distortion = distortion; + tmp_best_sse = total_sse; + tmp_best_skippable = skippable; + tmp_best_mbmode = *mbmi; + tmp_best_partition = *x->partition_info; + for (i = 0; i < 4; i++) + tmp_best_bmodes[i] = xd->mode_info_context->bmi[i]; + pred_exists = 1; + if (switchable_filter_index == 0 && + cpi->sf.use_rd_breakout && + best_rd < INT64_MAX) { + if (tmp_best_rdu / 2 > best_rd) { + // skip searching the other filters if the first is + // already substantially larger than the best so far + tmp_best_filter = mbmi->interp_filter; + tmp_best_rdu = INT64_MAX; + break; } + } + } } // switchable_filter_index loop + if (tmp_best_rdu == INT64_MAX) + continue; + mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ? 
tmp_best_filter : cm->mcomp_filter_type); vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); @@ -2855,17 +3673,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // switchable list (bilinear, 6-tap) is indicated at the frame level tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - second_ref, INT64_MAX, + second_ref, + best_yrd, &rate, &rate_y, &distortion, - &skippable, + &skippable, &total_sse, (int)this_rd_thresh, seg_mvs, + bsi, 0, mi_row, mi_col); + if (tmp_rd == INT64_MAX) + continue; } else { if (cpi->common.mcomp_filter_type == SWITCHABLE) { int rs = get_switchable_rate(cm, x); tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); } tmp_rd = tmp_best_rdu; + total_sse = tmp_best_sse; rate = tmp_best_rate; rate_y = tmp_best_ratey; distortion = tmp_best_distortion; @@ -2882,29 +3705,33 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->common.mcomp_filter_type == SWITCHABLE) rate2 += get_switchable_rate(cm, x); - // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV - vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, - BLOCK_SIZE_SB8X8); - vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8); - super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv, - &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4); - rate2 += rate_uv; - distortion2 += distortion_uv; - skippable = skippable && uv_skippable; - - txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < NB_TXFM_MODES; ++i) - txfm_cache[i] = txfm_cache[ONLY_4X4]; - if (!mode_excluded) { if (is_comp_pred) mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY; else mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY; } - compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred); + + if (RDCOST(x->rdmult, x->rddiv, rate2, distortion2) < + best_rd) { + // If even the 'Y' rd value of split is higher than best so far + // then dont bother looking at UV + vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, + BLOCK_SIZE_SB8X8); + vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8); + super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv, + &uv_skippable, &uv_sse, + BLOCK_SIZE_SB8X8, TX_4X4); + rate2 += rate_uv; + distortion2 += distortion_uv; + skippable = skippable && uv_skippable; + total_sse += uv_sse; + + txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + for (i = 0; i < NB_TXFM_MODES; ++i) + txfm_cache[i] = txfm_cache[ONLY_4X4]; + } } else { compmode_cost = vp9_cost_bit(comp_mode_p, mbmi->ref_frame[1] > INTRA_FRAME); @@ -2914,9 +3741,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate_y, &distortion_y, &rate_uv, &distortion_uv, &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv[this_mode], + &tmp_best_filter, frame_mv, mi_row, mi_col, - single_newmv); + single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) continue; } @@ -2938,15 +3765,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // because there are no non zero coefficients and make any // necessary adjustment for rate. Ignore if skip is coded at // segment level as the cost wont have been added in. - int mb_skip_allowed; - // Is Mb level skip allowed (i.e. not coded at segment level). 
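/* Illustrative sketch (hypothetical helper, not from the libvpx sources) of
 * the skip decision added a few lines below: for a non-intra, non-lossless
 * block where mb-level skip is allowed, the encoder compares the rd cost of
 * coding the residual against the rd cost of skipping and charging the raw
 * prediction SSE as distortion.  *rate2 is assumed to already include the
 * coefficient rate (rate_y + rate_uv), as it does in the real loop. */
#include <stdint.h>

#define RDCOST(RM, DM, R, D) \
  (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((int64_t)(DM)) * (D))

static int decide_mb_skip(int rdmult, int rddiv,
                          int rate_y, int rate_uv,
                          int64_t coded_dist, int64_t pred_sse,
                          int noskip_flag_cost, int skip_flag_cost,
                          int *rate2, int64_t *dist2) {
  if (RDCOST(rdmult, rddiv, rate_y + rate_uv, coded_dist) <
      RDCOST(rdmult, rddiv, 0, pred_sse)) {
    *rate2 += noskip_flag_cost;   /* keep coding the coefficients */
    *dist2 = coded_dist;
    return 0;
  }
  *rate2 += skip_flag_cost;       /* signal the skip flag instead */
  *rate2 -= rate_y + rate_uv;     /* coefficient bits are not spent */
  *dist2 = pred_sse;              /* distortion is the prediction SSE */
  return 1;                       /* corresponds to this_skip2 = 1 */
}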
- mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP); + const int mb_skip_allowed = !vp9_segfeature_active(&xd->seg, segment_id, + SEG_LVL_SKIP); if (skippable && bsize >= BLOCK_SIZE_SB8X8) { // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); - // for best_yrd calculation + // for best yrd calculation rate_uv = 0; if (mb_skip_allowed) { @@ -2954,17 +3780,37 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Cost the skip mb case vp9_prob skip_prob = - vp9_get_pred_prob(cm, xd, PRED_MBSKIP); + vp9_get_pred_prob_mbskip(cm, xd); if (skip_prob) { prob_skip_cost = vp9_cost_bit(skip_prob, 1); rate2 += prob_skip_cost; } } + } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && + !xd->lossless) { + if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < + RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { + // Add in the cost of the no skip flag. + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 0); + rate2 += prob_skip_cost; + } else { + // FIXME(rbultje) make this work for splitmv also + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 1); + rate2 += prob_skip_cost; + distortion2 = total_sse; + assert(total_sse >= 0); + rate2 -= (rate_y + rate_uv); + rate_y = 0; + rate_uv = 0; + this_skip2 = 1; + } } else if (mb_skip_allowed) { // Add in the cost of the no skip flag. - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd, - PRED_MBSKIP), 0); + int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), + 0); rate2 += prob_skip_cost; } @@ -2972,23 +3818,28 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } -#if 0 - // Keep record of best intra distortion - if ((xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) && - (this_rd < best_intra_rd)) { + // Keep record of best intra rd + if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME && + is_intra_mode(xd->mode_info_context->mbmi.mode) && + this_rd < best_intra_rd) { best_intra_rd = this_rd; - *returnintra = distortion2; + best_intra_mode = xd->mode_info_context->mbmi.mode; + } + // Keep record of best inter rd with single reference + if (xd->mode_info_context->mbmi.ref_frame[0] > INTRA_FRAME && + xd->mode_info_context->mbmi.ref_frame[1] == NONE && + !mode_excluded && + this_rd < best_inter_rd) { + best_inter_rd = this_rd; + best_inter_ref_frame = ref_frame; + // best_inter_mode = xd->mode_info_context->mbmi.mode; } -#endif - if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) + if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); - - if (this_rd < best_overall_rd) { - best_overall_rd = this_rd; - best_filter = tmp_best_filter; - best_mode = this_mode; + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) + best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } if (this_mode != I4X4_PRED && this_mode != SPLITMV) { @@ -3007,6 +3858,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_rd || x->skip) { if (!mode_excluded) { // Note index of best mode so far + const int qstep = xd->plane[0].dequant[1]; + best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { @@ -3017,12 +3870,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = rate2; *returndistortion = distortion2; best_rd = this_rd; + best_yrd = best_rd - + RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = 
*mbmi; + best_skip2 = this_skip2; best_partition = *x->partition_info; if (this_mode == I4X4_PRED || this_mode == SPLITMV) for (i = 0; i < 4; i++) best_bmodes[i] = xd->mode_info_context->bmi[i]; + + // TODO(debargha): enhance this test with a better distortion prediction + // based on qp, activity mask and history + if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) + if (ref_frame > INTRA_FRAME && distortion2 * 4 < qstep * qstep) + early_term = 1; } #if 0 // Testing this mode gave rise to an improvement in best error score. @@ -3075,6 +3937,26 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_pred_rd[HYBRID_PREDICTION] = hybrid_rd; } + /* keep record of best filter type */ + if (!mode_excluded && !disable_skip && mbmi->ref_frame[0] != INTRA_FRAME && + cm->mcomp_filter_type != BILINEAR) { + int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? + VP9_SWITCHABLE_FILTERS : + vp9_switchable_interp_map[cm->mcomp_filter_type]]; + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + int64_t adj_rd; + // In cases of poor prediction, filter_cache[] can contain really big + // values, which actually are bigger than this_rd itself. This can + // cause negative best_filter_rd[] values, which is obviously silly. + // Therefore, if filter_cache < ref, we do an adjusted calculation. + if (cpi->rd_filter_cache[i] >= ref) + adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; + else // FIXME(rbultje) do this for comppred also + adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref; + best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); + } + } + /* keep record of best txfm size */ if (bsize < BLOCK_SIZE_SB32X32) { if (bsize < BLOCK_SIZE_MB16X16) { @@ -3088,7 +3970,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < NB_TXFM_MODES; i++) { int64_t adj_rd = INT64_MAX; if (this_mode != I4X4_PRED) { - adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->txfm_mode]; + adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->tx_mode]; } else { adj_rd = this_rd; } @@ -3098,9 +3980,41 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } + if (early_term) + break; + if (x->skip && !mode_excluded) break; } + if (best_rd >= best_rd_so_far) + return INT64_MAX; + + // If we used an estimate for the uv intra rd in the loop above... + if (cpi->sf.use_uv_intra_rd_estimate) { + // Do Intra UV best rd mode selection if best mode choice above was intra. + if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { + TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); + rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size], + &rate_uv_tokenonly[uv_tx_size], + &dist_uv[uv_tx_size], + &skip_uv[uv_tx_size], + (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 + : bsize); + } + } + + // If indicated then mark the index of the chosen mode to be inspected at + // other block sizes. + if (bsize <= cpi->sf.unused_mode_skip_lvl) { + cpi->unused_mode_skip_mask = cpi->unused_mode_skip_mask & + (~((int64_t)1 << best_mode_index)); + } + + // If we are using reference masking and the set mask flag is set then + // create the reference frame mask. + if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) + cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame); + // Flag all modes that have a distortion thats > 2x the best we found at // this level. 
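/* Illustrative sketch (hypothetical helper, not from the libvpx sources) of
 * the per-filter rd bookkeeping recorded just above: rd_filter_cache[] holds
 * a modeled rd for each interpolation filter, and `ref` is the cache entry
 * for the filter that was actually searched.  A cache value above `ref` is
 * added to this_rd linearly; a value below it is scaled proportionally to
 * this_rd, so the adjusted rd, and hence best_filter_rd[], stays non-negative. */
#include <stdint.h>

static int64_t adjusted_filter_rd(int64_t this_rd,
                                  int64_t filter_cache_rd,
                                  int64_t ref) {
  if (filter_cache_rd >= ref)
    return this_rd + (filter_cache_rd - ref);
  /* (ref - filter_cache_rd) / ref is below 1, so the result stays >= 0 */
  return this_rd - (ref - filter_cache_rd) * this_rd / ref;
}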
for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { @@ -3130,26 +4044,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, (cm->mcomp_filter_type == best_mbmode.interp_filter) || (best_mbmode.ref_frame[0] == INTRA_FRAME)); - // Accumulate filter usage stats - // TODO(agrange): Use RD criteria to select interpolation filter mode. - if (is_inter_mode(best_mode)) - ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]]; - // Updating rd_thresh_freq_fact[] here means that the differnt // partition/block sizes are handled independently based on the best // choice for the current partition. It may well be better to keep a scaled // best rd so far value and update rd_thresh_freq_fact based on the mode/size // combination that wins out. - if (cpi->sf.adpative_rd_thresh) { + if (cpi->sf.adaptive_rd_thresh) { for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { if (mode_index == best_mode_index) { cpi->rd_thresh_freq_fact[bsize][mode_index] = BASE_RD_THRESH_FREQ_FACT; } else { cpi->rd_thresh_freq_fact[bsize][mode_index] += MAX_RD_THRESH_FREQ_INC; if (cpi->rd_thresh_freq_fact[bsize][mode_index] > - (cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) { + (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT)) { cpi->rd_thresh_freq_fact[bsize][mode_index] = - cpi->sf.adpative_rd_thresh * MAX_RD_THRESH_FREQ_FACT; + cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FREQ_FACT; } } } @@ -3170,36 +4079,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } #endif - // This code forces Altref,0,0 and skip for the frame that overlays a - // an alrtef unless Altref is filtered. However, this is unsafe if - // segment level coding of ref frame is enabled for this segment. - if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && - cpi->is_src_frame_alt_ref && - (cpi->oxcf.arnr_max_frames == 0) && - (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame[0] != ALTREF_FRAME) - && bsize >= BLOCK_SIZE_SB8X8) { - mbmi->mode = ZEROMV; - mbmi->ref_frame[0] = ALTREF_FRAME; - mbmi->ref_frame[1] = NONE; - mbmi->mv[0].as_int = 0; - mbmi->uv_mode = DC_PRED; - mbmi->mb_skip_coeff = 1; - if (cm->txfm_mode == TX_MODE_SELECT) { - if (bsize >= BLOCK_SIZE_SB32X32) - mbmi->txfm_size = TX_32X32; - else if (bsize >= BLOCK_SIZE_MB16X16) - mbmi->txfm_size = TX_16X16; - else - mbmi->txfm_size = TX_8X8; - } - - vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); - vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff)); - goto end; - } - // macroblock modes *mbmi = best_mbmode; + x->skip |= best_skip2; if (best_mbmode.ref_frame[0] == INTRA_FRAME && best_mbmode.sb_type < BLOCK_SIZE_SB8X8) { for (i = 0; i < 4; i++) @@ -3219,8 +4101,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *x->partition_info = best_partition; - mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int; - mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int; + mbmi->mv[0].as_int = xd->mode_info_context->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->mode_info_context->bmi[3].as_mv[1].as_int; } for (i = 0; i < NB_PREDICTION_TYPES; ++i) { @@ -3231,6 +4113,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + if (best_filter_rd[i] == INT64_MAX) + best_filter_diff[i] = 0; + else + best_filter_diff[i] = best_rd - best_filter_rd[i]; + } + if (cm->mcomp_filter_type == SWITCHABLE) + assert(best_filter_diff[VP9_SWITCHABLE_FILTERS] == 0); + } else { + vpx_memset(best_filter_diff, 
0, sizeof(best_filter_diff)); + } + + if (!x->skip) { for (i = 0; i < NB_TXFM_MODES; i++) { if (best_txfm_rd[i] == INT64_MAX) best_txfm_diff[i] = 0; @@ -3241,7 +4136,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); } - end: set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], scale_factor); store_coding_context(x, ctx, best_mode_index, @@ -3249,7 +4143,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]][0], - best_pred_diff, best_txfm_diff); + best_pred_diff, best_txfm_diff, best_filter_diff); return best_rd; } diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index dcf5d00..7c84b48 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -15,18 +15,20 @@ #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) +#define QIDX_SKIP_THRESH 115 + void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx); + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - int *r, int *d, BLOCK_SIZE_TYPE bsize, - PICK_MODE_CONTEXT *ctx); + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd); void vp9_init_me_luts(); diff --git a/libvpx/vp9/encoder/vp9_sad_c.c b/libvpx/vp9/encoder/vp9_sad_c.c index 6b1ba49..42ddb21 100644 --- a/libvpx/vp9/encoder/vp9_sad_c.c +++ b/libvpx/vp9/encoder/vp9_sad_c.c @@ -11,25 +11,43 @@ #include <stdlib.h> #include "vp9/common/vp9_sadmxn.h" +#include "vp9/encoder/vp9_variance.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "./vp9_rtcd.h" -unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64); -} - -unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 32); -} +#define sad_mxn_func(m, n) \ +unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + unsigned int max_sad) { \ + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ +} \ +unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \ + int src_stride, \ + const uint8_t *ref_ptr, \ + int ref_stride, \ + const uint8_t *second_pred, \ + unsigned int max_sad) { \ + uint8_t comp_pred[m * n]; \ + comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \ +} + +sad_mxn_func(64, 64) +sad_mxn_func(64, 32) +sad_mxn_func(32, 64) +sad_mxn_func(32, 32) +sad_mxn_func(32, 16) +sad_mxn_func(16, 32) +sad_mxn_func(16, 16) +sad_mxn_func(16, 8) +sad_mxn_func(8, 16) +sad_mxn_func(8, 8) +sad_mxn_func(8, 4) +sad_mxn_func(4, 8) +sad_mxn_func(4, 4) void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, @@ -46,14 +64,6 @@ void vp9_sad64x32x4d_c(const uint8_t 
*src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 64); -} - void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -69,22 +79,6 @@ void vp9_sad32x64x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32); -} - -unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 16); -} - void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -100,14 +94,6 @@ void vp9_sad32x16x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 32); -} - void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], @@ -123,63 +109,6 @@ void vp9_sad16x32x4d_c(const uint8_t *src_ptr, ref_ptr[3], ref_stride, 0x7fffffff); } -unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16); -} - -unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8); -} - - -unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8); -} - -unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16); -} - -unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 4); -} - -unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 8); -} - -unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad) { - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); -} - void vp9_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index fe995ad..ef84cc5 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -18,14 +18,14 @@ void vp9_enable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; - cpi->mb.e_mbd.segmentation_enabled = 1; - cpi->mb.e_mbd.update_mb_segmentation_map = 1; - cpi->mb.e_mbd.update_mb_segmentation_data = 1; + 
cpi->mb.e_mbd.seg.enabled = 1; + cpi->mb.e_mbd.seg.update_map = 1; + cpi->mb.e_mbd.seg.update_data = 1; } void vp9_disable_segmentation(VP9_PTR ptr) { VP9_COMP *cpi = (VP9_COMP *)ptr; - cpi->mb.e_mbd.segmentation_enabled = 0; + cpi->mb.e_mbd.seg.enabled = 0; } void vp9_set_segmentation_map(VP9_PTR ptr, @@ -37,8 +37,8 @@ void vp9_set_segmentation_map(VP9_PTR ptr, (cpi->common.mi_rows * cpi->common.mi_cols)); // Signal that the map should be updated. - cpi->mb.e_mbd.update_mb_segmentation_map = 1; - cpi->mb.e_mbd.update_mb_segmentation_data = 1; + cpi->mb.e_mbd.seg.update_map = 1; + cpi->mb.e_mbd.seg.update_data = 1; } void vp9_set_segment_data(VP9_PTR ptr, @@ -46,10 +46,10 @@ void vp9_set_segment_data(VP9_PTR ptr, unsigned char abs_delta) { VP9_COMP *cpi = (VP9_COMP *)(ptr); - cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; + cpi->mb.e_mbd.seg.abs_delta = abs_delta; - vpx_memcpy(cpi->mb.e_mbd.segment_feature_data, feature_data, - sizeof(cpi->mb.e_mbd.segment_feature_data)); + vpx_memcpy(cpi->mb.e_mbd.seg.feature_data, feature_data, + sizeof(cpi->mb.e_mbd.seg.feature_data)); // TBD ?? Set the feature mask // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0, @@ -115,8 +115,7 @@ static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, - MODE_INFO *mi, +static void count_segs(VP9_COMP *cpi, MODE_INFO *mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, @@ -137,20 +136,19 @@ static void count_segs(VP9_COMP *cpi, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; // Test to see if the segment id matches the predicted value. - const int pred_seg_id = vp9_get_pred_mi_segid(cm, mi->mbmi.sb_type, - mi_row, mi_col); - const int seg_predicted = (segment_id == pred_seg_id); - - // Get the segment id prediction context - const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID); + const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, + bsize, mi_row, mi_col); + const int pred_flag = pred_segment_id == segment_id; + const int pred_context = vp9_get_pred_context_seg_id(xd); // Store the prediction status for this mb and update counts // as appropriate - vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted); - temporal_predictor_count[pred_context][seg_predicted]++; + vp9_set_pred_flag_seg_id(cm, bsize, mi_row, mi_col, pred_flag); + temporal_predictor_count[pred_context][pred_flag]++; - if (!seg_predicted) + if (!pred_flag) // Update the "unpredicted" segment count t_unpred_seg_counts[segment_id]++; } @@ -218,15 +216,14 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { int no_pred_cost; int t_pred_cost = INT_MAX; - int i; - int tile_col, mi_row, mi_col; + int i, tile_col, mi_row, mi_col; int temporal_predictor_count[PREDICTION_PROBS][2]; - int no_pred_segcounts[MAX_MB_SEGMENTS]; - int t_unpred_seg_counts[MAX_MB_SEGMENTS]; + int no_pred_segcounts[MAX_SEGMENTS]; + int t_unpred_seg_counts[MAX_SEGMENTS]; - vp9_prob no_pred_tree[MB_SEG_TREE_PROBS]; - vp9_prob t_pred_tree[MB_SEG_TREE_PROBS]; + vp9_prob no_pred_tree[SEG_TREE_PROBS]; + vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; @@ -234,8 +231,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // Set default state for the segment tree probabilities and the // temporal coding probabilities - vpx_memset(xd->mb_segment_tree_probs, 255, 
sizeof(xd->mb_segment_tree_probs)); - vpx_memset(cm->segment_pred_probs, 255, sizeof(cm->segment_pred_probs)); + vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs)); + vpx_memset(xd->seg.pred_probs, 255, sizeof(xd->seg.pred_probs)); vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts)); vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts)); @@ -243,18 +240,16 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // First of all generate stats regarding how well the last segment map // predicts this one - for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) { + for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { vp9_get_tile_col_offsets(cm, tile_col); mi_ptr = cm->mi + cm->cur_tile_mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; - mi_col += 8, mi += 8) { + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += 8, mi += 8) count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64); - } } } @@ -285,11 +280,11 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // Now choose which coding method to use. if (t_pred_cost < no_pred_cost) { - cm->temporal_update = 1; - vpx_memcpy(xd->mb_segment_tree_probs, t_pred_tree, sizeof(t_pred_tree)); - vpx_memcpy(cm->segment_pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); + xd->seg.temporal_update = 1; + vpx_memcpy(xd->seg.tree_probs, t_pred_tree, sizeof(t_pred_tree)); + vpx_memcpy(xd->seg.pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); } else { - cm->temporal_update = 0; - vpx_memcpy(xd->mb_segment_tree_probs, no_pred_tree, sizeof(no_pred_tree)); + xd->seg.temporal_update = 0; + vpx_memcpy(xd->seg.tree_probs, no_pred_tree, sizeof(no_pred_tree)); } } diff --git a/libvpx/vp9/encoder/vp9_ssim.c b/libvpx/vp9/encoder/vp9_ssim.c index 363ed84..c155516 100644 --- a/libvpx/vp9/encoder/vp9_ssim.c +++ b/libvpx/vp9/encoder/vp9_ssim.c @@ -88,8 +88,9 @@ double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1, double ssim_total = 0; // sample point start with each 4x4 location - for (i = 0; i < height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j < width - 8; j += 4) { + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); ssim_total += v; samples++; @@ -104,16 +105,16 @@ double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double ssimv; a = vp9_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, source->y_width, - source->y_height); + source->y_stride, dest->y_stride, + source->y_crop_width, source->y_crop_height); b = vp9_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); c = vp9_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); ssimv = a * .8 + .1 * (b + c); @@ -128,16 +129,16 @@ double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double a, b, c; a = vp9_ssim2(source->y_buffer, dest->y_buffer, - source->y_stride, 
dest->y_stride, source->y_width, - source->y_height); + source->y_stride, dest->y_stride, + source->y_crop_width, source->y_crop_height); b = vp9_ssim2(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); c = vp9_ssim2(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, - source->uv_height); + source->uv_stride, dest->uv_stride, + source->uv_crop_width, source->uv_crop_height); *ssim_y = a; *ssim_u = b; *ssim_v = c; diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c new file mode 100644 index 0000000..667b801 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_subexp.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" + +#include "vp9/encoder/vp9_boolhuff.h" +#include "vp9/encoder/vp9_treewriter.h" + +#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8) +#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) + +static int update_bits[255]; + +static int count_uniform(int v, int n) { + int l = get_unsigned_bits(n); + int m; + if (l == 0) return 0; + m = (1 << l) - n; + if (v < m) + return l - 1; + else + return l; +} + +static int split_index(int i, int n, int modulus) { + int max1 = (n - 1 - modulus / 2) / modulus + 1; + if (i % modulus == modulus / 2) + i = i / modulus; + else + i = max1 + i - (i + modulus - modulus / 2) / modulus; + return i; +} + +static int recenter_nonneg(int v, int m) { + if (v > (m << 1)) + return v; + else if (v >= m) + return ((v - m) << 1); + else + return ((m - v) << 1) - 1; +} + +static int remap_prob(int v, int m) { + int i; + static const int map_table[MAX_PROB - 1] = { + // generated by: + // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM); + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, + 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130, + 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, + 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, + 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, + 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, + 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199, + 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213, + 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, + 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, + }; + v--; + m--; + if ((m << 1) <= MAX_PROB) + i = 
recenter_nonneg(v, m) - 1; + else + i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + + i = map_table[i]; + return i; +} + +static int count_term_subexp(int word, int k, int num_syms) { + int count = 0; + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (num_syms <= mk + 3 * a) { + count += count_uniform(word - mk, num_syms - mk); + break; + } else { + int t = (word >= mk + a); + count++; + if (t) { + i = i + 1; + mk += a; + } else { + count += b; + break; + } + } + } + return count; +} + +static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { + int delp = remap_prob(newp, oldp); + return update_bits[delp] * 256; +} + +static void encode_uniform(vp9_writer *w, int v, int n) { + int l = get_unsigned_bits(n); + int m; + if (l == 0) + return; + m = (1 << l) - n; + if (v < m) { + vp9_write_literal(w, v, l - 1); + } else { + vp9_write_literal(w, m + ((v - m) >> 1), l - 1); + vp9_write_literal(w, (v - m) & 1, 1); + } +} + +static void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (num_syms <= mk + 3 * a) { + encode_uniform(w, word - mk, num_syms - mk); + break; + } else { + int t = (word >= mk + a); + vp9_write_literal(w, t, 1); + if (t) { + i = i + 1; + mk += a; + } else { + vp9_write_literal(w, word - mk, b); + break; + } + } + } +} + +void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) { + const int delp = remap_prob(newp, oldp); + encode_term_subexp(w, delp, SUBEXP_PARAM, 255); +} + +void vp9_compute_update_table() { + int i; + for (i = 0; i < 254; i++) + update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255); +} + +int vp9_prob_diff_update_savings_search(const unsigned int *ct, + vp9_prob oldp, vp9_prob *bestp, + vp9_prob upd) { + const int old_b = cost_branch256(ct, oldp); + int bestsavings = 0; + vp9_prob newp, bestnewp = oldp; + const int step = *bestp > oldp ? -1 : 1; + + for (newp = *bestp; newp != oldp; newp += step) { + const int new_b = cost_branch256(ct, newp); + const int update_b = prob_diff_update_cost(newp, oldp) + vp9_cost_upd256; + const int savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + *bestp = bestnewp; + return bestsavings; +} + +int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vp9_prob *oldp, + vp9_prob *bestp, + vp9_prob upd, + int b, int r) { + int i, old_b, new_b, update_b, savings, bestsavings, step; + int newp; + vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; + vp9_model_to_full_probs(oldp, oldplist); + vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES); + for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i) + old_b += cost_branch256(ct + 2 * i, oldplist[i]); + old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]); + + bestsavings = 0; + bestnewp = oldp[PIVOT_NODE]; + + step = (*bestp > oldp[PIVOT_NODE] ? 
-1 : 1); + + for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) { + if (newp < 1 || newp > 255) + continue; + newplist[PIVOT_NODE] = newp; + vp9_model_to_full_probs(newplist, newplist); + for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i) + new_b += cost_branch256(ct + 2 * i, newplist[i]); + new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]); + update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + + vp9_cost_upd256; + savings = old_b - new_b - update_b; + if (savings > bestsavings) { + bestsavings = savings; + bestnewp = newp; + } + } + *bestp = bestnewp; + return bestsavings; +} + +void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, + vp9_prob upd, unsigned int *ct) { + vp9_prob newp = get_binary_prob(ct[0], ct[1]); + const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp, + upd); + assert(newp >= 1); + if (savings > 0) { + vp9_write(w, 1, upd); + vp9_write_prob_diff_update(w, newp, *oldp); + *oldp = newp; + } else { + vp9_write(w, 0, upd); + } +} diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h new file mode 100644 index 0000000..7acdaf6 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_subexp.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_DECODER_VP9_SUBEXP_H_ +#define VP9_DECODER_VP9_SUBEXP_H_ + +void vp9_compute_update_table(); + + +void vp9_write_prob_diff_update(vp9_writer *w, + vp9_prob newp, vp9_prob oldp); + +void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp, + vp9_prob upd, unsigned int *ct); + +int vp9_prob_diff_update_savings_search(const unsigned int *ct, + vp9_prob oldp, vp9_prob *bestp, + vp9_prob upd); + + +int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, + const vp9_prob *oldp, + vp9_prob *bestp, + vp9_prob upd, + int b, int r); + +#endif // VP9_DECODER_VP9_SUBEXP_H_ diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 47792fc..821b7c6 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -51,25 +51,25 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, &xd->scale_factor[which_mv], 16, 16, which_mv, - &xd->subpix); + &xd->subpix, MV_PRECISION_Q3); stride = (stride + 1) >> 1; - vp9_build_inter_predictor_q4(u_mb_ptr, stride, - &pred[256], 8, - &mv, - &xd->scale_factor_uv[which_mv], - 8, 8, - which_mv, - &xd->subpix); - - vp9_build_inter_predictor_q4(v_mb_ptr, stride, - &pred[320], 8, - &mv, - &xd->scale_factor_uv[which_mv], - 8, 8, - which_mv, - &xd->subpix); + vp9_build_inter_predictor(u_mb_ptr, stride, + &pred[256], 8, + &mv, + &xd->scale_factor[which_mv], + 8, 8, + which_mv, + &xd->subpix, MV_PRECISION_Q4); + + vp9_build_inter_predictor(v_mb_ptr, stride, + &pred[320], 8, + &mv, + &xd->scale_factor[which_mv], + 8, 8, + which_mv, + &xd->subpix, MV_PRECISION_Q4); } void vp9_temporal_filter_apply_c(uint8_t *frame1, @@ -148,9 +148,10 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // Further step/diamond searches as necessary if (cpi->speed < 8) - step_param = cpi->sf.first_step + ((cpi->speed > 5) ? 
1 : 0); + step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 1 : 0); else - step_param = cpi->sf.first_step + 2; + step_param = cpi->sf.reduce_first_step_size + 2; + step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); /*cpi->sf.search_method == HEX*/ // TODO Check that the 16x16 vf & sdf are selected here @@ -442,7 +443,6 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { cm->yv12_fb[cm->new_fb_idx].y_crop_width, cm->yv12_fb[cm->new_fb_idx].y_crop_height, cm->width, cm->height); - cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0]; // Setup frame pointers, NULL indicates frame not included in filter vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *)); diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 0a290e1..4b9c6c8 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -90,8 +90,6 @@ static void fill_value_tokens() { vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE; } -extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); - struct tokenize_b_args { VP9_COMP *cpi; MACROBLOCKD *xd; @@ -106,7 +104,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; - PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; TX_SIZE tx_size = ss_txfrm_size / 2; int dry_run = args->dry_run; @@ -115,6 +112,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int c = 0, rc = 0; TOKENEXTRA *t = *tp; /* store tokens starting here */ const int eob = xd->plane[plane].eobs[block]; + const PLANE_TYPE type = xd->plane[plane].plane_type; const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : mbmi->sb_type; @@ -125,56 +123,42 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, const int loff = (off >> mod) << tx_size; ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - int seg_eob, default_eob, pad; + int seg_eob; const int segment_id = mbmi->segment_id; - const int *scan, *nb; + const int16_t *scan, *nb; vp9_coeff_count *counts; vp9_coeff_probs_model *coef_probs; const int ref = mbmi->ref_frame[0] != INTRA_FRAME; ENTROPY_CONTEXT above_ec, left_ec; uint8_t token_cache[1024]; - TX_TYPE tx_type = DCT_DCT; - const uint8_t * band_translate; + const uint8_t *band_translate; assert((!type && !plane) || (type && plane)); counts = cpi->coef_counts[tx_size]; coef_probs = cpi->common.fc.coef_probs[tx_size]; switch (tx_size) { default: - case TX_4X4: { - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_4x4(xd, block) : DCT_DCT; + case TX_4X4: above_ec = A[0] != 0; left_ec = L[0] != 0; seg_eob = 16; - scan = get_scan_4x4(tx_type); + scan = get_scan_4x4(get_tx_type_4x4(type, xd, block)); band_translate = vp9_coefband_trans_4x4; break; - } - case TX_8X8: { - const int sz = 1 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? 
- get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT; + case TX_8X8: above_ec = (A[0] + A[1]) != 0; left_ec = (L[0] + L[1]) != 0; seg_eob = 64; - scan = get_scan_8x8(tx_type); + scan = get_scan_8x8(get_tx_type_8x8(type, xd)); band_translate = vp9_coefband_trans_8x8plus; break; - } - case TX_16X16: { - const int sz = 2 + b_width_log2(sb_type); - const int x = block & ((1 << sz) - 1), y = block - x; - tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT; + case TX_16X16: above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; seg_eob = 256; - scan = get_scan_16x16(tx_type); + scan = get_scan_16x16(get_tx_type_16x16(type, xd)); band_translate = vp9_coefband_trans_8x8plus; break; - } case TX_32X32: above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; @@ -185,10 +169,9 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, } pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); - default_eob = seg_eob; + nb = vp9_get_coef_neighbors_handle(scan); - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) + if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)) seg_eob = 0; c = 0; @@ -198,7 +181,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int v = 0; rc = scan[c]; if (c) - pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob); + pt = get_coef_context(nb, token_cache, c); if (c < eob) { v = qcoeff_ptr[rc]; assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE); @@ -213,21 +196,12 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, t->context_tree = coef_probs[type][ref][band][pt]; t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0); -#if CONFIG_BALANCED_COEFTREE - assert(token <= ZERO_TOKEN || - vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); -#else assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); -#endif if (!dry_run) { ++counts[type][ref][band][pt][token]; -#if CONFIG_BALANCED_COEFTREE - if (!t->skip_eob_node && token > ZERO_TOKEN) -#else if (!t->skip_eob_node) -#endif - ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt]; + ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt]; } token_cache[scan[c]] = vp9_pt_energy_class[token]; ++t; @@ -263,8 +237,7 @@ int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { int result = 1; struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_in_plane(xd, bsize, 0, - is_skippable, &args); + foreach_transformed_block_in_plane(xd, bsize, 0, is_skippable, &args); return result; } @@ -275,26 +248,22 @@ int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) { return result; } -void vp9_tokenize_sb(VP9_COMP *cpi, - MACROBLOCKD *xd, - TOKENEXTRA **t, - int dry_run, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON * const cm = &cpi->common; - MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; +void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; TOKENEXTRA *t_backup = *t; - const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP); - const int segment_id = mbmi->segment_id; - const int skip_inc = !vp9_segfeature_active(xd, segment_id, 
SEG_LVL_SKIP); + const int mb_skip_context = vp9_get_pred_context_mbskip(xd); + const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id, + SEG_LVL_SKIP); const TX_SIZE txfm_size = mbmi->txfm_size; - struct tokenize_b_args arg = { - cpi, xd, t, txfm_size, dry_run - }; + struct tokenize_b_args arg = { cpi, xd, t, txfm_size, dry_run }; mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize); - if (mbmi->mb_skip_coeff) { if (!dry_run) - cm->fc.mbskip_count[mb_skip_context][1] += skip_inc; + cm->counts.mbskip[mb_skip_context][1] += skip_inc; vp9_reset_sb_tokens_context(xd, bsize); if (dry_run) *t = t_backup; @@ -302,7 +271,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, } if (!dry_run) - cm->fc.mbskip_count[mb_skip_context][0] += skip_inc; + cm->counts.mbskip[mb_skip_context][0] += skip_inc; foreach_transformed_block(xd, bsize, tokenize_b, &arg); diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index e7f90c9..bc7d935 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -36,8 +36,8 @@ int vp9_sby_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); int vp9_sbuv_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize); struct VP9_COMP; -void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd, - TOKENEXTRA **t, int dry_run, BLOCK_SIZE_TYPE bsize); +void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, + BLOCK_SIZE_TYPE bsize); #ifdef ENTROPY_STATS void init_context_counters(); diff --git a/libvpx/vp9/encoder/vp9_variance.h b/libvpx/vp9/encoder/vp9_variance.h index 38808d7..6e686d6 100644 --- a/libvpx/vp9/encoder/vp9_variance.h +++ b/libvpx/vp9/encoder/vp9_variance.h @@ -20,6 +20,13 @@ typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int ref_stride, unsigned int max_sad); +typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int ref_stride, + const uint8_t *second_pred, + unsigned int max_sad); + typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -74,20 +81,21 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, int ref_stride); typedef struct vp9_variance_vtable { - vp9_sad_fn_t sdf; - vp9_variance_fn_t vf; - vp9_subpixvariance_fn_t svf; - vp9_subp_avg_variance_fn_t svaf; - vp9_variance_fn_t svf_halfpix_h; - vp9_variance_fn_t svf_halfpix_v; - vp9_variance_fn_t svf_halfpix_hv; - vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi1_fn_t sdx8f; - vp9_sad_multi_d_fn_t sdx4df; + vp9_sad_fn_t sdf; + vp9_sad_avg_fn_t sdaf; + vp9_variance_fn_t vf; + vp9_subpixvariance_fn_t svf; + vp9_subp_avg_variance_fn_t svaf; + vp9_variance_fn_t svf_halfpix_h; + vp9_variance_fn_t svf_halfpix_v; + vp9_variance_fn_t svf_halfpix_hv; + vp9_sad_multi_fn_t sdx3f; + vp9_sad_multi1_fn_t sdx8f; + vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, uint8_t *ref, int ref_stride) { + int height, const uint8_t *ref, int ref_stride) { int i, j; for (i = 0; i < height; i++) { diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm deleted file mode 100644 index 54766d8..0000000 --- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
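For reference, the vp9_subexp.c hunk earlier in this change reduces to a cost/benefit search: a new branch probability is only signalled when the bits saved on the counted branches outweigh the subexponentially coded delta plus the update flag. Below is a minimal scalar sketch of that decision; bits_for_branch() and bits_for_delta() are simplified, hypothetical stand-ins for cost_branch256() and prob_diff_update_cost(), not the library's fixed-point cost tables.

```c
#include <math.h>
#include <stdio.h>

/* Hypothetical stand-in for cost_branch256(): bits needed to code ct[0]
 * zero-branches and ct[1] one-branches when the "zero" probability is p/256. */
static double bits_for_branch(const unsigned int ct[2], int p) {
  const double p0 = p / 256.0;
  return -(ct[0] * log2(p0) + ct[1] * log2(1.0 - p0));
}

/* Hypothetical stand-in for prob_diff_update_cost(): assume the subexp-coded
 * delta costs roughly 2 * log2(|delta| + 1) + 2 bits. */
static double bits_for_delta(int newp, int oldp) {
  const int d = newp > oldp ? newp - oldp : oldp - newp;
  return 2.0 * log2(d + 1.0) + 2.0;
}

/* Same shape as vp9_prob_diff_update_savings_search(): walk from the
 * candidate probability toward the old one and keep the best net saving. */
static double savings_search(const unsigned int ct[2], int oldp, int *bestp) {
  const int step = *bestp > oldp ? -1 : 1;
  const double old_bits = bits_for_branch(ct, oldp);
  double best_savings = 0.0;
  int newp, best_newp = oldp;

  for (newp = *bestp; newp != oldp; newp += step) {
    const double savings = old_bits - bits_for_branch(ct, newp)
                         - bits_for_delta(newp, oldp);
    if (savings > best_savings) {
      best_savings = savings;
      best_newp = newp;
    }
  }
  *bestp = best_newp;
  return best_savings;
}

int main(void) {
  const unsigned int ct[2] = { 900, 100 };  /* observed branch counts */
  int newp = 230;                           /* candidate derived from counts */
  printf("saved %.1f bits, new prob %d\n", savings_search(ct, 128, &newp), newp);
  return 0;
}
```

Only when the returned saving is positive does vp9_cond_prob_diff_update() write the update flag followed by the subexp-coded delta; otherwise a single zero flag is sent and the old probability stays in force.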
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch) -global sym(vp9_short_fdct4x4_mmx) PRIVATE -sym(vp9_short_fdct4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ; input - mov rdi, arg(1) ; output - - movsxd rax, dword ptr arg(2) ;pitch - - lea rcx, [rsi + rax*2] - ; read the input data - movq mm0, [rsi] - movq mm1, [rsi + rax] - - movq mm2, [rcx] - movq mm4, [rcx + rax] - - ; transpose for the first stage - movq mm3, mm0 ; 00 01 02 03 - movq mm5, mm2 ; 20 21 22 23 - - punpcklwd mm0, mm1 ; 00 10 01 11 - punpckhwd mm3, mm1 ; 02 12 03 13 - - punpcklwd mm2, mm4 ; 20 30 21 31 - punpckhwd mm5, mm4 ; 22 32 23 33 - - movq mm1, mm0 ; 00 10 01 11 - punpckldq mm0, mm2 ; 00 10 20 30 - - punpckhdq mm1, mm2 ; 01 11 21 31 - - movq mm2, mm3 ; 02 12 03 13 - punpckldq mm2, mm5 ; 02 12 22 32 - - punpckhdq mm3, mm5 ; 03 13 23 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 3 - - ; first stage - movq mm5, mm0 - movq mm4, mm1 - - paddw mm0, mm3 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm4, mm2 ; c1 = 1 - 2 - psubw mm5, mm3 ; d1 = 0 - 3 - - psllw mm5, 3 - psllw mm4, 3 - - psllw mm0, 3 - psllw mm1, 3 - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; op[0] = a1 + b1 - psubw mm2, mm1 ; op[2] = a1 - b1 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm4 ; c1 d1 - punpckhwd mm5, mm4 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_14500)] - paddd mm4, MMWORD PTR[GLOBAL(_14500)] - paddd mm3, MMWORD PTR[GLOBAL(_7500)] - paddd mm5, MMWORD PTR[GLOBAL(_7500)] - - psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12 - psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12 - - packssdw mm1, mm4 ; op[1] - packssdw mm3, mm5 ; op[3] - - ; done with vertical - ; transpose for the second stage - movq mm4, mm0 ; 00 10 20 30 - movq mm5, mm2 ; 02 12 22 32 - - punpcklwd mm0, mm1 ; 00 01 10 11 - punpckhwd mm4, mm1 ; 20 21 30 31 - - punpcklwd mm2, mm3 ; 02 03 12 13 - punpckhwd mm5, mm3 ; 22 23 32 33 - - movq mm1, mm0 ; 00 01 10 11 - punpckldq mm0, mm2 ; 00 01 02 03 - - punpckhdq mm1, mm2 ; 01 22 12 13 - - movq mm2, mm4 ; 20 31 30 31 - punpckldq mm2, mm5 ; 20 21 22 23 - - punpckhdq mm4, mm5 ; 30 31 32 33 - - ; mm0 0 - ; mm1 1 - ; mm2 2 - ; mm3 4 - - movq mm5, mm0 - movq mm3, mm1 - - paddw mm0, mm4 ; a1 = 0 + 3 - paddw mm1, mm2 ; b1 = 1 + 2 - - psubw mm3, mm2 ; c1 = 1 - 2 - psubw mm5, mm4 ; d1 = 0 - 3 - - pxor mm6, mm6 ; zero out for compare - - pcmpeqw mm6, mm5 ; d1 != 0 - - pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper, - ; and keep bit 0 of lower - - ; output 0 and 2 - movq mm2, mm0 ; a1 - - paddw mm0, mm1 ; a1 + b1 - psubw mm2, mm1 ; a1 - b1 - - paddw mm0, MMWORD PTR[GLOBAL(_7w)] - paddw mm2, MMWORD PTR[GLOBAL(_7w)] - - psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4 - psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4 - - movq MMWORD PTR[rdi + 0 ], mm0 - movq 
MMWORD PTR[rdi + 16], mm2 - - ; output 1 and 3 - ; interleave c1, d1 - movq mm1, mm5 ; d1 - punpcklwd mm1, mm3 ; c1 d1 - punpckhwd mm5, mm3 ; c1 d1 - - movq mm3, mm1 - movq mm4, mm5 - - pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352 - - pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352 - - paddd mm1, MMWORD PTR[GLOBAL(_12000)] - paddd mm4, MMWORD PTR[GLOBAL(_12000)] - paddd mm3, MMWORD PTR[GLOBAL(_51000)] - paddd mm5, MMWORD PTR[GLOBAL(_51000)] - - psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16 - psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16 - - packssdw mm1, mm4 ; op[4] - packssdw mm3, mm5 ; op[12] - - paddw mm1, mm6 ; op[4] += (d1!=0) - - movq MMWORD PTR[rdi + 8 ], mm1 - movq MMWORD PTR[rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 8 -_5352_2217: - dw 5352 - dw 2217 - dw 5352 - dw 2217 -align 8 -_2217_neg5352: - dw 2217 - dw -5352 - dw 2217 - dw -5352 -align 8 -_cmp_mask: - times 4 dw 1 -align 8 -_7w: - times 4 dw 7 -align 8 -_14500: - times 2 dd 14500 -align 8 -_7500: - times 2 dd 7500 -align 8 -_12000: - times 2 dd 12000 -align 8 -_51000: - times 2 dd 51000 diff --git a/libvpx/vp9/encoder/x86/vp9_dct_mmx.h b/libvpx/vp9/encoder/x86/vp9_dct_mmx.h deleted file mode 100644 index 3bac7c8..0000000 --- a/libvpx/vp9/encoder/x86/vp9_dct_mmx.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
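The vp9_dct_mmx.asm file removed above carried the VP8-style 4x4 forward DCT, and its register comments spell out the arithmetic it performed. As a reading aid, here is a scalar sketch of that arithmetic (an illustration only, not the removed assembly nor its SSE2 replacement):

```c
/* Scalar sketch of the deleted MMX routine's arithmetic: a first pass scaled
 * by << 3 with >> 12 rounding, then a transposed pass with (x + 7) >> 4 and
 * >> 16 rounding plus the (d1 != 0) correction on op[4]. */
static void short_fdct4x4_scalar(const short *input, short *output, int pitch) {
  int i, a1, b1, c1, d1;
  const short *ip = input;
  short *op = output;

  for (i = 0; i < 4; i++) {                    /* vertical pass */
    a1 = (ip[0] + ip[3]) << 3;
    b1 = (ip[1] + ip[2]) << 3;
    c1 = (ip[1] - ip[2]) << 3;
    d1 = (ip[0] - ip[3]) << 3;

    op[0] = (short)(a1 + b1);
    op[2] = (short)(a1 - b1);
    op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

    ip += pitch / 2;                           /* pitch is given in bytes */
    op += 4;
  }

  for (i = 0, op = output; i < 4; i++, op++) { /* horizontal pass */
    a1 = op[0] + op[12];
    b1 = op[4] + op[8];
    c1 = op[4] - op[8];
    d1 = op[0] - op[12];

    op[0]  = (short)((a1 + b1 + 7) >> 4);
    op[8]  = (short)((a1 - b1 + 7) >> 4);
    op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
  }
}
```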
- */ - -#ifndef VP9_ENCODER_X86_VP9_DCT_MMX_H_ -#define VP9_ENCODER_X86_VP9_DCT_MMX_H_ - -extern void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch); - - -#endif /* VP9_ENCODER_X86_VP9_DCT_MMX_H_ */ diff --git a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c index aaacebe..bf09c7a 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_sse2.c @@ -10,6 +10,7 @@ #include <emmintrin.h> // SSE2 #include "vp9/common/vp9_idct.h" // for cospi constants +#include "vpx_ports/mem.h" void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty @@ -116,6 +117,166 @@ void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); } +static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) { + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i mask; + + in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); + in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); + + in[0] = _mm_slli_epi16(in[0], 4); + in[1] = _mm_slli_epi16(in[1], 4); + in[2] = _mm_slli_epi16(in[2], 4); + in[3] = _mm_slli_epi16(in[3], 4); + + mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); + in[0] = _mm_add_epi16(in[0], mask); + in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); +} + +static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) { + const __m128i kOne = _mm_set1_epi16(1); + __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); + __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); + __m128i out01 = _mm_add_epi16(in01, kOne); + __m128i out23 = _mm_add_epi16(in23, kOne); + out01 = _mm_srai_epi16(out01, 2); + out23 = _mm_srai_epi16(out23, 2); + _mm_store_si128((__m128i *)(output + 0 * 8), out01); + _mm_store_si128((__m128i *)(output + 1 * 8), out23); +} + +static INLINE void transpose_4x4(__m128i *res) { + // Combine and transpose + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); + res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + // 00 10 20 30 01 11 21 31 + // 02 12 22 32 03 13 23 33 + // only use the first 4 16-bit integers + res[1] = _mm_unpackhi_epi64(res[0], res[0]); + res[3] = _mm_unpackhi_epi64(res[2], res[2]); +} + +void fdct4_1d_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u[4], v[4]; + u[0] = _mm_add_epi16(in[0], in[3]); + u[1] = _mm_add_epi16(in[1], in[2]); + u[2] = _mm_sub_epi16(in[1], in[2]); + u[3] = _mm_sub_epi16(in[0], in[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpacklo_epi16(u[2], u[3]); + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 + u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 + u[2] = 
_mm_madd_epi16(v[1], k__cospi_p24_p08); // 1 + u[3] = _mm_madd_epi16(v[1], k__cospi_m08_p24); // 3 + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + transpose_4x4(in); +} + +void fadst4_1d_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); + const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); + const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); + const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + __m128i in7 = _mm_add_epi16(in[0], in[1]); + in7 = _mm_sub_epi16(in7, in[3]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpacklo_epi16(in[2], in[3]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpacklo_epi16(in[2], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = v[2]; + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[2]); + in[1] = _mm_packs_epi32(u[1], u[3]); + transpose_4x4(in); +} + +void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output, + int stride, int tx_type) { + __m128i in[4]; + load_buffer_4x4(input, in, stride); + switch (tx_type) { + case 0: // DCT_DCT + fdct4_1d_sse2(in); + fdct4_1d_sse2(in); + break; + case 1: // ADST_DCT + fadst4_1d_sse2(in); + fdct4_1d_sse2(in); + break; + case 2: // DCT_ADST + fdct4_1d_sse2(in); + fadst4_1d_sse2(in); + break; + case 3: // ADST_ADST + fadst4_1d_sse2(in); + fadst4_1d_sse2(in); + break; + default: + assert(0); + break; + } + write_buffer_4x4(output, in); +} + void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; @@ -133,14 +294,14 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input - __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 
= _mm_loadu_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); @@ -362,15 +523,543 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results - _mm_storeu_si128((__m128i *)(output + 0 * 8), in0); - _mm_storeu_si128((__m128i *)(output + 1 * 8), in1); - _mm_storeu_si128((__m128i *)(output + 2 * 8), in2); - _mm_storeu_si128((__m128i *)(output + 3 * 8), in3); - _mm_storeu_si128((__m128i *)(output + 4 * 8), in4); - _mm_storeu_si128((__m128i *)(output + 5 * 8), in5); - _mm_storeu_si128((__m128i *)(output + 6 * 8), in6); - _mm_storeu_si128((__m128i *)(output + 7 * 8), in7); + _mm_store_si128((__m128i *)(output + 0 * 8), in0); + _mm_store_si128((__m128i *)(output + 1 * 8), in1); + _mm_store_si128((__m128i *)(output + 2 * 8), in2); + _mm_store_si128((__m128i *)(output + 3 * 8), in3); + _mm_store_si128((__m128i *)(output + 4 * 8), in4); + _mm_store_si128((__m128i *)(output + 5 * 8), in5); + _mm_store_si128((__m128i *)(output + 6 * 8), in6); + _mm_store_si128((__m128i *)(output + 7 * 8), in7); + } +} + +// load 8x8 array +static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) { + in[0] = _mm_load_si128((__m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((__m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((__m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((__m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((__m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((__m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((__m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((__m128i *)(input + 7 * stride)); + + in[0] = _mm_slli_epi16(in[0], 2); + in[1] = _mm_slli_epi16(in[1], 2); + in[2] = _mm_slli_epi16(in[2], 2); + in[3] = _mm_slli_epi16(in[3], 2); + in[4] = _mm_slli_epi16(in[4], 2); + in[5] = _mm_slli_epi16(in[5], 2); + in[6] = _mm_slli_epi16(in[6], 2); + in[7] = _mm_slli_epi16(in[7], 2); +} + +// right shift and rounding +static INLINE void right_shift_8x8(__m128i *res, int const bit) { + const __m128i kOne = _mm_set1_epi16(1); + const int bit_m02 = bit - 2; + __m128i sign0 = _mm_srai_epi16(res[0], 15); + __m128i sign1 = _mm_srai_epi16(res[1], 15); + __m128i sign2 = _mm_srai_epi16(res[2], 15); + __m128i sign3 = _mm_srai_epi16(res[3], 15); + __m128i sign4 = _mm_srai_epi16(res[4], 15); + __m128i sign5 = _mm_srai_epi16(res[5], 15); + __m128i sign6 = _mm_srai_epi16(res[6], 15); + __m128i sign7 = _mm_srai_epi16(res[7], 15); + + if (bit_m02 >= 0) { + 
__m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); + res[0] = _mm_add_epi16(res[0], k_const_rounding); + res[1] = _mm_add_epi16(res[1], k_const_rounding); + res[2] = _mm_add_epi16(res[2], k_const_rounding); + res[3] = _mm_add_epi16(res[3], k_const_rounding); + res[4] = _mm_add_epi16(res[4], k_const_rounding); + res[5] = _mm_add_epi16(res[5], k_const_rounding); + res[6] = _mm_add_epi16(res[6], k_const_rounding); + res[7] = _mm_add_epi16(res[7], k_const_rounding); + } + + res[0] = _mm_sub_epi16(res[0], sign0); + res[1] = _mm_sub_epi16(res[1], sign1); + res[2] = _mm_sub_epi16(res[2], sign2); + res[3] = _mm_sub_epi16(res[3], sign3); + res[4] = _mm_sub_epi16(res[4], sign4); + res[5] = _mm_sub_epi16(res[5], sign5); + res[6] = _mm_sub_epi16(res[6], sign6); + res[7] = _mm_sub_epi16(res[7], sign7); + + res[0] = _mm_srai_epi16(res[0], bit); + res[1] = _mm_srai_epi16(res[1], bit); + res[2] = _mm_srai_epi16(res[2], bit); + res[3] = _mm_srai_epi16(res[3], bit); + res[4] = _mm_srai_epi16(res[4], bit); + res[5] = _mm_srai_epi16(res[5], bit); + res[6] = _mm_srai_epi16(res[6], bit); + res[7] = _mm_srai_epi16(res[7], bit); +} + +// write 8x8 array +static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { + _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); + _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); + _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); + _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); + _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); + _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); + _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); + _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); +} + +// perform in-place transpose +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = 
_mm_unpackhi_epi64(tr1_6, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 +} + +void fdct8_1d_sse2(__m128i *in) { + // constants + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 1 + s0 = _mm_add_epi16(in[0], in[7]); + s1 = _mm_add_epi16(in[1], in[6]); + s2 = _mm_add_epi16(in[2], in[5]); + s3 = _mm_add_epi16(in[3], in[4]); + s4 = _mm_sub_epi16(in[3], in[4]); + s5 = _mm_sub_epi16(in[2], in[5]); + s6 = _mm_sub_epi16(in[1], in[6]); + s7 = _mm_sub_epi16(in[0], in[7]); + + u0 = _mm_add_epi16(s0, s3); + u1 = _mm_add_epi16(s1, s2); + u2 = _mm_sub_epi16(s1, s2); + u3 = _mm_sub_epi16(s0, s3); + // interleave and perform butterfly multiplication/addition + v0 = _mm_unpacklo_epi16(u0, u1); + v1 = _mm_unpackhi_epi16(u0, u1); + v2 = _mm_unpacklo_epi16(u2, u3); + v3 = _mm_unpackhi_epi16(u2, u3); + + u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); + u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); + u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); + u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); + u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); + u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); + u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); + u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); + + // shift and rounding + v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u0, u1); + in[2] = _mm_packs_epi32(u4, u5); + in[4] = _mm_packs_epi32(u2, u3); + in[6] = _mm_packs_epi32(u6, u7); + + // stage 2 + // interleave and perform butterfly multiplication/addition + u0 = _mm_unpacklo_epi16(s6, s5); + u1 = _mm_unpackhi_epi16(s6, s5); + v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, 
k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + + u0 = _mm_packs_epi32(v0, v1); + u1 = _mm_packs_epi32(v2, v3); + + // stage 3 + s0 = _mm_add_epi16(s4, u0); + s1 = _mm_sub_epi16(s4, u0); + s2 = _mm_sub_epi16(s7, u1); + s3 = _mm_add_epi16(s7, u1); + + // stage 4 + u0 = _mm_unpacklo_epi16(s0, s3); + u1 = _mm_unpackhi_epi16(s0, s3); + u2 = _mm_unpacklo_epi16(s1, s2); + u3 = _mm_unpackhi_epi16(s1, s2); + + v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); + v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); + v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); + v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); + v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); + v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); + v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); + v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); + + // shift and rounding + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + in[1] = _mm_packs_epi32(v0, v1); + in[3] = _mm_packs_epi32(v4, v5); + in[5] = _mm_packs_epi32(v2, v3); + in[7] = _mm_packs_epi32(v6, v7); + + // transpose + array_transpose_8x8(in, in); +} + +void fadst8_1d_sse2(__m128i *in) { + // Constants + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column 
transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = 
_mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + // FIXME(jingning): do subtract using bit inversion? 
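Every transform in this file leans on one idiom: interleave two 16-bit rows, multiply-accumulate against a pair of fixed-point cosine constants with _mm_madd_epi16, add DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS, and pack back to 16 bits. A scalar model of a single lane of that idiom, assuming the usual vp9_idct.h values (the header is included above for the cospi constants; DCT_CONST_BITS is taken to be 14):

```c
#include <stdint.h>

/* Assumed to match vp9_idct.h. */
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* What the add-rounding / _mm_srai_epi32 / _mm_packs_epi32 sequence does to
 * one 32-bit lane (ignoring the saturation that packs applies). */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)((input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

/* One butterfly output: a single lane of
 * _mm_madd_epi16(_mm_unpacklo_epi16(x, y), pair_set_epi16(c0, c1))
 * followed by the rounding shift. */
static int16_t butterfly(int16_t x, int16_t y, int16_t c0, int16_t c1) {
  return dct_const_round_shift((int32_t)x * c0 + (int32_t)y * c1);
}
```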
+ in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); + + // transpose + array_transpose_8x8(in, in); +} + +void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output, + int stride, int tx_type) { + __m128i in[8]; + load_buffer_8x8(input, in, stride); + switch (tx_type) { + case 0: // DCT_DCT + fdct8_1d_sse2(in); + fdct8_1d_sse2(in); + break; + case 1: // ADST_DCT + fadst8_1d_sse2(in); + fdct8_1d_sse2(in); + break; + case 2: // DCT_ADST + fdct8_1d_sse2(in); + fadst8_1d_sse2(in); + break; + case 3: // ADST_ADST + fadst8_1d_sse2(in); + fadst8_1d_sse2(in); + break; + default: + assert(0); + break; } + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); } void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { @@ -383,7 +1072,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; // We need an intermediate buffer between passes. - int16_t intermediate[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); int16_t *in = input; int16_t *out = intermediate; // Constants @@ -426,22 +1115,22 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { __m128i res08, res09, res10, res11, res12, res13, res14, res15; // Load and pre-condition input. if (0 == pass) { - in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); @@ -460,22 +1149,22 @@ void 
vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { - in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); @@ -982,14 +1671,14 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results - _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); + _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); + _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); + _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); + _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); + _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); + _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); + _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); + _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } @@ -998,3 +1687,2109 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { out = output; } } + +static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0, + __m128i *in1, int stride) { + // load first 8 columns + load_buffer_8x8(input, in0, stride); + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); + + input += 8; + // 
load second 8 columns + load_buffer_8x8(input, in1, stride); + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); +} + +static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, + __m128i *in1, int stride) { + // write first 8 columns + write_buffer_8x8(output, in0, stride); + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); + // write second 8 columns + output += 8; + write_buffer_8x8(output, in1, stride); + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { + // perform rounding operations + right_shift_8x8(res0, 2); + right_shift_8x8(res0 + 8, 2); + right_shift_8x8(res1, 2); + right_shift_8x8(res1 + 8, 2); +} + +void fdct16_1d_8col(__m128i *in) { + // perform 16x16 1-D DCT for 8 columns + __m128i i[8], s[8], p[8], t[8], u[16], v[16]; + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // stage 1 + i[0] = _mm_add_epi16(in[0], in[15]); + i[1] = _mm_add_epi16(in[1], in[14]); + i[2] = _mm_add_epi16(in[2], in[13]); + i[3] = _mm_add_epi16(in[3], in[12]); + i[4] = _mm_add_epi16(in[4], in[11]); + i[5] = _mm_add_epi16(in[5], in[10]); + i[6] = _mm_add_epi16(in[6], in[9]); + i[7] = _mm_add_epi16(in[7], in[8]); + + s[0] = _mm_sub_epi16(in[7], in[8]); + s[1] = _mm_sub_epi16(in[6], in[9]); + s[2] = _mm_sub_epi16(in[5], in[10]); + s[3] = _mm_sub_epi16(in[4], in[11]); + s[4] = _mm_sub_epi16(in[3], in[12]); + s[5] = _mm_sub_epi16(in[2], in[13]); + s[6] = _mm_sub_epi16(in[1], in[14]); + s[7] = _mm_sub_epi16(in[0], in[15]); + + p[0] = _mm_add_epi16(i[0], i[7]); + p[1] = _mm_add_epi16(i[1], i[6]); + p[2] = _mm_add_epi16(i[2], i[5]); + p[3] = _mm_add_epi16(i[3], i[4]); + p[4] = _mm_sub_epi16(i[3], i[4]); + p[5] = 
_mm_sub_epi16(i[2], i[5]); + p[6] = _mm_sub_epi16(i[1], i[6]); + p[7] = _mm_sub_epi16(i[0], i[7]); + + u[0] = _mm_add_epi16(p[0], p[3]); + u[1] = _mm_add_epi16(p[1], p[2]); + u[2] = _mm_sub_epi16(p[1], p[2]); + u[3] = _mm_sub_epi16(p[0], p[3]); + + v[0] = _mm_unpacklo_epi16(u[0], u[1]); + v[1] = _mm_unpackhi_epi16(u[0], u[1]); + v[2] = _mm_unpacklo_epi16(u[2], u[3]); + v[3] = _mm_unpackhi_epi16(u[2], u[3]); + + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); + u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); + u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); + u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); + u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); + u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); + u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); + u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[4] = _mm_packs_epi32(u[4], u[5]); + in[8] = _mm_packs_epi32(u[2], u[3]); + in[12] = _mm_packs_epi32(u[6], u[7]); + + u[0] = _mm_unpacklo_epi16(p[5], p[6]); + u[1] = _mm_unpackhi_epi16(p[5], p[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[2], v[3]); + + t[0] = _mm_add_epi16(p[4], u[0]); + t[1] = _mm_sub_epi16(p[4], u[0]); + t[2] = _mm_sub_epi16(p[7], u[1]); + t[3] = _mm_add_epi16(p[7], u[1]); + + u[0] = _mm_unpacklo_epi16(t[0], t[3]); + u[1] = _mm_unpackhi_epi16(t[0], t[3]); + u[2] = _mm_unpacklo_epi16(t[1], t[2]); + u[3] = _mm_unpackhi_epi16(t[1], t[2]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); + v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); + v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); + v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); + v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); + v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], 
k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + in[2] = _mm_packs_epi32(v[0], v[1]); + in[6] = _mm_packs_epi32(v[4], v[5]); + in[10] = _mm_packs_epi32(v[2], v[3]); + in[14] = _mm_packs_epi32(v[6], v[7]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[2], s[5]); + u[1] = _mm_unpackhi_epi16(s[2], s[5]); + u[2] = _mm_unpacklo_epi16(s[3], s[4]); + u[3] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[2] = _mm_packs_epi32(v[0], v[1]); + t[3] = _mm_packs_epi32(v[2], v[3]); + t[4] = _mm_packs_epi32(v[4], v[5]); + t[5] = _mm_packs_epi32(v[6], v[7]); + + // stage 3 + p[0] = _mm_add_epi16(s[0], t[3]); + p[1] = _mm_add_epi16(s[1], t[2]); + p[2] = _mm_sub_epi16(s[1], t[2]); + p[3] = _mm_sub_epi16(s[0], t[3]); + p[4] = _mm_sub_epi16(s[7], t[4]); + p[5] = _mm_sub_epi16(s[6], t[5]); + p[6] = _mm_add_epi16(s[6], t[5]); + p[7] = _mm_add_epi16(s[7], t[4]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(p[1], p[6]); + u[1] = _mm_unpackhi_epi16(p[1], p[6]); + u[2] = _mm_unpacklo_epi16(p[2], p[5]); + u[3] = _mm_unpackhi_epi16(p[2], p[5]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); + v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); + v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); + v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); + v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + 
v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[1] = _mm_packs_epi32(v[0], v[1]); + t[2] = _mm_packs_epi32(v[2], v[3]); + t[5] = _mm_packs_epi32(v[4], v[5]); + t[6] = _mm_packs_epi32(v[6], v[7]); + + // stage 5 + s[0] = _mm_add_epi16(p[0], t[1]); + s[1] = _mm_sub_epi16(p[0], t[1]); + s[2] = _mm_sub_epi16(p[3], t[2]); + s[3] = _mm_add_epi16(p[3], t[2]); + s[4] = _mm_add_epi16(p[4], t[5]); + s[5] = _mm_sub_epi16(p[4], t[5]); + s[6] = _mm_sub_epi16(p[7], t[6]); + s[7] = _mm_add_epi16(p[7], t[6]); + + // stage 6 + u[0] = _mm_unpacklo_epi16(s[0], s[7]); + u[1] = _mm_unpackhi_epi16(s[0], s[7]); + u[2] = _mm_unpacklo_epi16(s[1], s[6]); + u[3] = _mm_unpackhi_epi16(s[1], s[6]); + u[4] = _mm_unpacklo_epi16(s[2], s[5]); + u[5] = _mm_unpackhi_epi16(s[2], s[5]); + u[6] = _mm_unpacklo_epi16(s[3], s[4]); + u[7] = _mm_unpackhi_epi16(s[3], s[4]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); + v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); + v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); + v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); + v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); + v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); + v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); + v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); + v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); + v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); + v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); + v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); + v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); + v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + 
in[1] = _mm_packs_epi32(v[0], v[1]); + in[9] = _mm_packs_epi32(v[2], v[3]); + in[5] = _mm_packs_epi32(v[4], v[5]); + in[13] = _mm_packs_epi32(v[6], v[7]); + in[3] = _mm_packs_epi32(v[8], v[9]); + in[11] = _mm_packs_epi32(v[10], v[11]); + in[7] = _mm_packs_epi32(v[12], v[13]); + in[15] = _mm_packs_epi32(v[14], v[15]); +} + +void fadst16_1d_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = 
_mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], 
k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + 
v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = 
_mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = 
_mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = 
_mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +void fdct16_1d_sse2(__m128i *in0, __m128i *in1) { + fdct16_1d_8col(in0); + fdct16_1d_8col(in1); + array_transpose_16x16(in0, in1); +} + +void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { + fadst16_1d_8col(in0); + fadst16_1d_8col(in1); + array_transpose_16x16(in0, in1); +} + +void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output, + int stride, int tx_type) { + __m128i in0[16], in1[16]; + load_buffer_16x16(input, in0, in1, stride); + switch (tx_type) { + case 0: // DCT_DCT + fdct16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_1d_sse2(in0, in1); + break; + case 1: // ADST_DCT + fadst16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fdct16_1d_sse2(in0, in1); + break; + case 2: // DCT_ADST + fdct16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_1d_sse2(in0, in1); + break; + case 3: // ADST_ADST + fadst16_1d_sse2(in0, in1); + right_shift_16x16(in0, in1); + fadst16_1d_sse2(in0, in1); + break; + default: + assert(0); + break; + } + write_buffer_16x16(output, in0, in1, 16); +} + +void vp9_short_fdct32x32_rd_sse2(int16_t *input, + int16_t *output_org, int pitch) { + // Calculate pre-multiplied strides + const int str1 = pitch >> 1; + const int str2 = pitch; + const int str3 = pitch + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. 
That would help the + // instruction cache but is actually slower. + { + int16_t *ina = in + 0 * str1; + int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[ 0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 4 * str1; + int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[ 4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 8 * str1; + int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[ 8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 
0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + int16_t *ina = in + 12 * str1; + int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[ 0] = _mm_add_epi16(ina0, inb0); + step1a[ 1] = _mm_add_epi16(ina1, inb1); + step1a[ 2] = _mm_add_epi16(ina2, inb2); + step1a[ 3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. 
+ { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[ 0] = _mm_add_epi16(in00, in31); + step1[ 1] = _mm_add_epi16(in01, in30); + step1[ 2] = _mm_add_epi16(in02, in29); + step1[ 3] = _mm_add_epi16(in03, in28); + step1[28] = _mm_sub_epi16(in03, in28); + step1[29] = _mm_sub_epi16(in02, in29); + step1[30] = _mm_sub_epi16(in01, in30); + step1[31] = _mm_sub_epi16(in00, in31); + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[ 4] = _mm_add_epi16(in04, in27); + step1[ 5] = _mm_add_epi16(in05, in26); + step1[ 6] = _mm_add_epi16(in06, in25); + step1[ 7] = _mm_add_epi16(in07, in24); + step1[24] = _mm_sub_epi16(in07, in24); + step1[25] = _mm_sub_epi16(in06, in25); + step1[26] = _mm_sub_epi16(in05, in26); + step1[27] = _mm_sub_epi16(in04, in27); + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[ 8] = _mm_add_epi16(in08, in23); + step1[ 9] = _mm_add_epi16(in09, in22); + step1[10] = _mm_add_epi16(in10, in21); + step1[11] = _mm_add_epi16(in11, in20); + step1[20] = _mm_sub_epi16(in11, in20); + step1[21] = _mm_sub_epi16(in10, in21); + step1[22] = _mm_sub_epi16(in09, in22); + step1[23] = _mm_sub_epi16(in08, in23); + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = _mm_add_epi16(in12, in19); + step1[13] = _mm_add_epi16(in13, in18); + step1[14] = _mm_add_epi16(in14, in17); + step1[15] = _mm_add_epi16(in15, in16); + step1[16] = _mm_sub_epi16(in15, in16); + step1[17] = _mm_sub_epi16(in14, in17); + step1[18] = _mm_sub_epi16(in13, in18); + step1[19] = _mm_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[ 0] = _mm_add_epi16(step1[0], step1[15]); + 
step2[ 1] = _mm_add_epi16(step1[1], step1[14]); + step2[ 2] = _mm_add_epi16(step1[2], step1[13]); + step2[ 3] = _mm_add_epi16(step1[3], step1[12]); + step2[ 4] = _mm_add_epi16(step1[4], step1[11]); + step2[ 5] = _mm_add_epi16(step1[5], step1[10]); + step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); + step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); + step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); + step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); + step2[10] = _mm_sub_epi16(step1[5], step1[10]); + step2[11] = _mm_sub_epi16(step1[4], step1[11]); + step2[12] = _mm_sub_epi16(step1[3], step1[12]); + step2[13] = _mm_sub_epi16(step1[2], step1[13]); + step2[14] = _mm_sub_epi16(step1[1], step1[14]); + step2[15] = _mm_sub_epi16(step1[0], step1[15]); + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = 
_mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); + } + // Stage 3 + { + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm_add_epi16(step2[23], step1[16]); + step3[17] = _mm_add_epi16(step2[22], step1[17]); + step3[18] = _mm_add_epi16(step2[21], step1[18]); + step3[19] = _mm_add_epi16(step2[20], step1[19]); + step3[20] = _mm_sub_epi16(step1[19], step2[20]); + step3[21] = _mm_sub_epi16(step1[18], step2[21]); + step3[22] = _mm_sub_epi16(step1[17], step2[22]); + step3[23] = _mm_sub_epi16(step1[16], step2[23]); + step3[24] = _mm_sub_epi16(step1[31], step2[24]); + step3[25] = _mm_sub_epi16(step1[30], step2[25]); + step3[26] = _mm_sub_epi16(step1[29], step2[26]); + step3[27] = _mm_sub_epi16(step1[28], step2[27]); + step3[28] = _mm_add_epi16(step2[27], step1[28]); + step3[29] = _mm_add_epi16(step2[26], step1[29]); + step3[30] = _mm_add_epi16(step2[25], step1[30]); + step3[31] = _mm_add_epi16(step2[24], step1[31]); + } + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero); + step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0); + step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0); + step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0); + step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0); + step3[ 4] = _mm_sub_epi16(step3[ 4], 
s3_04_0); + step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0); + step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0); + step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0); + step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); + step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); + step3[10] = _mm_sub_epi16(step3[10], s3_10_0); + step3[11] = _mm_sub_epi16(step3[11], s3_11_0); + step3[12] = _mm_sub_epi16(step3[12], s3_12_0); + step3[13] = _mm_sub_epi16(step3[13], s3_13_0); + step2[14] = _mm_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm_sub_epi16(step2[15], s2_15_0); + step3[16] = _mm_sub_epi16(step3[16], s3_16_0); + step3[17] = _mm_sub_epi16(step3[17], s3_17_0); + step3[18] = _mm_sub_epi16(step3[18], s3_18_0); + step3[19] = _mm_sub_epi16(step3[19], s3_19_0); + step3[20] = _mm_sub_epi16(step3[20], s3_20_0); + step3[21] = _mm_sub_epi16(step3[21], s3_21_0); + step3[22] = _mm_sub_epi16(step3[22], s3_22_0); + step3[23] = _mm_sub_epi16(step3[23], s3_23_0); + step3[24] = _mm_sub_epi16(step3[24], s3_24_0); + step3[25] = _mm_sub_epi16(step3[25], s3_25_0); + step3[26] = _mm_sub_epi16(step3[26], s3_26_0); + step3[27] = _mm_sub_epi16(step3[27], s3_27_0); + step3[28] = _mm_sub_epi16(step3[28], s3_28_0); + step3[29] = _mm_sub_epi16(step3[29], s3_29_0); + step3[30] = _mm_sub_epi16(step3[30], s3_30_0); + step3[31] = _mm_sub_epi16(step3[31], s3_31_0); + step3[ 0] = _mm_add_epi16(step3[ 0], kOne); + step3[ 1] = _mm_add_epi16(step3[ 1], kOne); + step3[ 2] = _mm_add_epi16(step3[ 2], kOne); + step3[ 3] = _mm_add_epi16(step3[ 3], kOne); + step3[ 4] = _mm_add_epi16(step3[ 4], kOne); + step3[ 5] = _mm_add_epi16(step3[ 5], kOne); + step3[ 6] = _mm_add_epi16(step3[ 6], kOne); + step3[ 7] = _mm_add_epi16(step3[ 7], kOne); + step2[ 8] = _mm_add_epi16(step2[ 8], kOne); + step2[ 9] = _mm_add_epi16(step2[ 9], kOne); + step3[10] = _mm_add_epi16(step3[10], kOne); + step3[11] = _mm_add_epi16(step3[11], kOne); + step3[12] = _mm_add_epi16(step3[12], kOne); + step3[13] = _mm_add_epi16(step3[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step3[16] = _mm_add_epi16(step3[16], kOne); + step3[17] = _mm_add_epi16(step3[17], kOne); + step3[18] = _mm_add_epi16(step3[18], kOne); + step3[19] = _mm_add_epi16(step3[19], kOne); + step3[20] = _mm_add_epi16(step3[20], kOne); + step3[21] = _mm_add_epi16(step3[21], kOne); + step3[22] = _mm_add_epi16(step3[22], kOne); + step3[23] = _mm_add_epi16(step3[23], kOne); + step3[24] = _mm_add_epi16(step3[24], kOne); + step3[25] = _mm_add_epi16(step3[25], kOne); + step3[26] = _mm_add_epi16(step3[26], kOne); + step3[27] = _mm_add_epi16(step3[27], kOne); + step3[28] = _mm_add_epi16(step3[28], kOne); + step3[29] = _mm_add_epi16(step3[29], kOne); + step3[30] = _mm_add_epi16(step3[30], kOne); + step3[31] = _mm_add_epi16(step3[31], kOne); + step3[ 0] = _mm_srai_epi16(step3[ 0], 2); + step3[ 1] = _mm_srai_epi16(step3[ 1], 2); + step3[ 2] = _mm_srai_epi16(step3[ 2], 2); + step3[ 3] = _mm_srai_epi16(step3[ 3], 2); + step3[ 4] = _mm_srai_epi16(step3[ 4], 2); + step3[ 5] = _mm_srai_epi16(step3[ 5], 2); + step3[ 6] = _mm_srai_epi16(step3[ 6], 2); + step3[ 7] = _mm_srai_epi16(step3[ 7], 2); + step2[ 8] = _mm_srai_epi16(step2[ 8], 2); + step2[ 9] = _mm_srai_epi16(step2[ 9], 2); + step3[10] = _mm_srai_epi16(step3[10], 2); + step3[11] = _mm_srai_epi16(step3[11], 2); + step3[12] = _mm_srai_epi16(step3[12], 2); + step3[13] = _mm_srai_epi16(step3[13], 2); + step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step3[16] = _mm_srai_epi16(step3[16], 2); + 
step3[17] = _mm_srai_epi16(step3[17], 2); + step3[18] = _mm_srai_epi16(step3[18], 2); + step3[19] = _mm_srai_epi16(step3[19], 2); + step3[20] = _mm_srai_epi16(step3[20], 2); + step3[21] = _mm_srai_epi16(step3[21], 2); + step3[22] = _mm_srai_epi16(step3[22], 2); + step3[23] = _mm_srai_epi16(step3[23], 2); + step3[24] = _mm_srai_epi16(step3[24], 2); + step3[25] = _mm_srai_epi16(step3[25], 2); + step3[26] = _mm_srai_epi16(step3[26], 2); + step3[27] = _mm_srai_epi16(step3[27], 2); + step3[28] = _mm_srai_epi16(step3[28], 2); + step3[29] = _mm_srai_epi16(step3[29], 2); + step3[30] = _mm_srai_epi16(step3[30], 2); + step3[31] = _mm_srai_epi16(step3[31], 2); + } + // Stage 4 + { + step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); + step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); + step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); + step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); + step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); + step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); + step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); + step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); + step1[12] = _mm_sub_epi16(step2[15], step3[12]); + step1[13] = _mm_sub_epi16(step2[14], step3[13]); + step1[14] = _mm_add_epi16(step3[13], step2[14]); + step1[15] = _mm_add_epi16(step3[12], step2[15]); + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = 
_mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm_add_epi16(step1[5], step3[4]); + step2[5] = _mm_sub_epi16(step3[4], step1[5]); + step2[6] = _mm_sub_epi16(step3[7], step1[6]); + step2[7] = _mm_add_epi16(step1[6], step3[7]); + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + 
const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 
= _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm_add_epi16(step1[19], step3[16]); + step2[17] = _mm_add_epi16(step1[18], step3[17]); + step2[18] = _mm_sub_epi16(step3[17], step1[18]); + step2[19] = _mm_sub_epi16(step3[16], step1[19]); + step2[20] = _mm_sub_epi16(step3[23], step1[20]); + step2[21] = _mm_sub_epi16(step3[22], step1[21]); + step2[22] = _mm_add_epi16(step1[21], step3[22]); + step2[23] = _mm_add_epi16(step1[20], step3[23]); + step2[24] = _mm_add_epi16(step1[27], step3[24]); + step2[25] = _mm_add_epi16(step1[26], step3[25]); + step2[26] = _mm_sub_epi16(step3[25], step1[26]); + step2[27] = _mm_sub_epi16(step3[24], step1[27]); + step2[28] = _mm_sub_epi16(step3[31], step1[28]); + step2[29] = _mm_sub_epi16(step3[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step3[30]); + step2[31] = _mm_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = 
_mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); + } + { + step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); + step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); + step3[10] = _mm_sub_epi16(step1[11], step2[10]); + step3[11] = _mm_add_epi16(step2[10], step1[11]); + step3[12] = _mm_add_epi16(step2[13], step1[12]); + step3[13] = _mm_sub_epi16(step1[12], step2[13]); + step3[14] = _mm_sub_epi16(step1[15], step2[14]); + step3[15] = _mm_add_epi16(step2[14], step1[15]); + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const 
__m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const 
__m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm_add_epi16(step3[17], step2[16]); + step1[17] = _mm_sub_epi16(step2[16], step3[17]); + step1[18] = _mm_sub_epi16(step2[19], step3[18]); + step1[19] = _mm_add_epi16(step3[18], step2[19]); + step1[20] = _mm_add_epi16(step3[21], step2[20]); + step1[21] = _mm_sub_epi16(step2[20], step3[21]); + step1[22] = _mm_sub_epi16(step2[23], step3[22]); + step1[23] = _mm_add_epi16(step3[22], step2[23]); + step1[24] = _mm_add_epi16(step3[25], step2[24]); + step1[25] = _mm_sub_epi16(step2[24], step3[25]); + step1[26] = _mm_sub_epi16(step2[27], step3[26]); + step1[27] = _mm_add_epi16(step3[26], step2[27]); + step1[28] = _mm_add_epi16(step3[29], step2[28]); + step1[29] = _mm_sub_epi16(step2[28], step3[29]); + step1[30] = _mm_sub_epi16(step2[31], step3[30]); + step1[31] = _mm_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. 
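Every rotation block in this function follows the same pattern: interleave two int16 vectors, multiply-accumulate against a pair of 14-bit fixed-point cosine constants, add the rounding constant, shift, and pack back to int16. A rough scalar equivalent of one such rotation (a sketch only; the constant names follow vp9_idct.h, and the SSE2 code performs eight of these per instruction) is:

    /* Scalar sketch of the madd / round / shift / pack pattern used above and
     * below (cf. dct_const_round_shift); illustrative, not the reference code. */
    #include <stdint.h>
    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
    static int16_t rotate_round(int16_t a, int16_t b, int c0, int c1) {
      const int32_t t = a * c0 + b * c1;              /* _mm_madd_epi16          */
      return (int16_t)((t + DCT_CONST_ROUNDING)       /* k__DCT_CONST_ROUNDING   */
                       >> DCT_CONST_BITS);            /* _mm_srai_epi32 + packs  */
    }
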
+ { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const 
__m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = _mm_add_epi32(out_11_2, 
k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } + // Transpose the results, do it as four 8x8 transposes. 
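The transpose that follows writes each 8x8 tile of the 32 result rows so that the second pass can read what were columns as contiguous rows. In scalar terms the effect is roughly the following, where tile[r][c] is a hypothetical element-wise view of the eight __m128i rows of one block and 32 is the row stride of the intermediate buffer:

    /* Scalar model of one 8x8 transpose below (sketch; tile[r][c] stands for
     * element c of the r-th __m128i row of the current block). */
    #include <stdint.h>
    static void transpose_8x8_tile(const int16_t tile[8][8], int16_t *output) {
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c)
          output[c * 32 + r] = tile[r][c];
    }
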
+ { + int transpose_block; + int16_t *output; + if (0 == pass) { + output = &intermediate[column_start * 32]; + } else { + output = &output_org[column_start * 32]; + } + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... 
+ __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in vp9/encoder/vp9_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); + // Process next 8x8 + output += 8; + } + } + } + } +} diff --git a/libvpx/vp9/encoder/x86/vp9_encodeopt.asm b/libvpx/vp9/encoder/x86/vp9_encodeopt.asm deleted file mode 100644 index 734cb61..0000000 --- a/libvpx/vp9/encoder/x86/vp9_encodeopt.asm +++ /dev/null @@ -1,125 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_xmm) PRIVATE -sym(vp9_block_error_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prologue - - mov rsi, arg(0) ;coeff_ptr - mov rdi, arg(1) ;dcoef_ptr - - movdqa xmm0, [rsi] - movdqa xmm1, [rdi] - - movdqa xmm2, [rsi+16] - movdqa xmm3, [rdi+16] - - psubw xmm0, xmm1 - psubw xmm2, xmm3 - - pmaddwd xmm0, xmm0 - pmaddwd xmm2, xmm2 - - paddd xmm0, xmm2 - - pxor xmm5, xmm5 - movdqa xmm1, xmm0 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - psrldq xmm0, 8 - paddd xmm0, xmm1 - - movq rax, xmm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - -;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_mmx) PRIVATE -sym(vp9_block_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov rdi, arg(1) ;dcoef_ptr - movq mm3, [rsi] - - movq mm4, [rdi] - movq mm5, [rsi+8] - - movq mm6, [rdi+8] - pxor mm1, mm1 ; from movd mm1, dc ; dc =0 - - movq mm2, mm7 - psubw mm5, mm6 - - por mm1, mm2 - pmaddwd mm5, mm5 - - pcmpeqw mm1, mm7 - psubw mm3, mm4 - - pand mm1, mm3 - pmaddwd mm1, mm1 - - paddd mm1, mm5 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm3, mm5 - - paddd mm1, mm3 - movq mm0, mm1 - - psrlq mm1, 32 - paddd mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm new file mode 100644 index 0000000..1126fdb --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_error_sse2.asm @@ -0,0 +1,74 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register + lea uqcq, [uqcq+sizeq*2] + lea dqcq, [dqcq+sizeq*2] + neg sizeq +.loop: + mova m2, [uqcq+sizeq*2] + mova m0, [dqcq+sizeq*2] + mova m3, [uqcq+sizeq*2+mmsize] + mova m1, [dqcq+sizeq*2+mmsize] + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 
15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m1, m5 + paddq m4, m0 + punpckhdq m1, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m1 + punpckhdq m2, m5 + paddq m6, m7 + punpckldq m7, m3, m5 + paddq m6, m2 + punpckhdq m3, m5 + paddq m6, m7 + paddq m6, m3 + add sizeq, mmsize + jl .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm b/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm deleted file mode 100644 index 7bee9ef..0000000 --- a/libvpx/vp9/encoder/x86/vp9_fwalsh_sse2.asm +++ /dev/null @@ -1,164 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch) -global sym(vp9_short_walsh4x4_sse2) PRIVATE -sym(vp9_short_walsh4x4_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ; input - mov rdi, arg(1) ; output - movsxd rdx, dword ptr arg(2) ; pitch - - ; first for loop - movq xmm0, MMWORD PTR [rsi] ; load input - movq xmm1, MMWORD PTR [rsi + rdx] - lea rsi, [rsi + rdx*2] - movq xmm2, MMWORD PTR [rsi] - movq xmm3, MMWORD PTR [rsi + rdx] - - punpcklwd xmm0, xmm1 - punpcklwd xmm2, xmm3 - - movdqa xmm1, xmm0 - punpckldq xmm0, xmm2 ; ip[1] ip[0] - punpckhdq xmm1, xmm2 ; ip[3] ip[2] - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 - psubw xmm2, xmm1 - - psllw xmm0, 2 ; d1 a1 - psllw xmm2, 2 ; c1 b1 - - movdqa xmm1, xmm0 - punpcklqdq xmm0, xmm2 ; b1 a1 - punpckhqdq xmm1, xmm2 ; c1 d1 - - pxor xmm6, xmm6 - movq xmm6, xmm0 - pxor xmm7, xmm7 - pcmpeqw xmm7, xmm6 - paddw xmm7, [GLOBAL(c1)] - - movdqa xmm2, xmm0 - paddw xmm0, xmm1 ; b1+c1 a1+d1 - psubw xmm2, xmm1 ; b1-c1 a1-d1 - paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0) - - ; second for loop - ; input: 13 9 5 1 12 8 4 0 (xmm0) - ; 14 10 6 2 15 11 7 3 (xmm2) - ; after shuffle: - ; 13 5 9 1 12 4 8 0 (xmm0) - ; 14 6 10 2 15 7 11 3 (xmm1) - pshuflw xmm3, xmm0, 0xd8 - pshufhw xmm0, xmm3, 0xd8 - pshuflw xmm3, xmm2, 0xd8 - pshufhw xmm1, xmm3, 0xd8 - - movdqa xmm2, xmm0 - pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10 - pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10 - movdqa xmm3, xmm1 - pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13 - pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13 - - pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10 - pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10 - pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12 - pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12 - - movdqa xmm0, xmm4 - punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10 - punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10 - movdqa xmm1, xmm6 - punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12 - punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12 - - movdqa xmm2, xmm0 - paddd xmm0, xmm4 ; b21 b20 a21 a20 - 
psubd xmm2, xmm4 ; c21 c20 d21 d20 - movdqa xmm3, xmm1 - paddd xmm1, xmm6 ; b23 b22 a23 a22 - psubd xmm3, xmm6 ; c23 c22 d23 d22 - - pxor xmm4, xmm4 - movdqa xmm5, xmm4 - pcmpgtd xmm4, xmm0 - pcmpgtd xmm5, xmm2 - pand xmm4, [GLOBAL(cd1)] - pand xmm5, [GLOBAL(cd1)] - - pxor xmm6, xmm6 - movdqa xmm7, xmm6 - pcmpgtd xmm6, xmm1 - pcmpgtd xmm7, xmm3 - pand xmm6, [GLOBAL(cd1)] - pand xmm7, [GLOBAL(cd1)] - - paddd xmm0, xmm4 - paddd xmm2, xmm5 - paddd xmm0, [GLOBAL(cd3)] - paddd xmm2, [GLOBAL(cd3)] - paddd xmm1, xmm6 - paddd xmm3, xmm7 - paddd xmm1, [GLOBAL(cd3)] - paddd xmm3, [GLOBAL(cd3)] - - psrad xmm0, 3 - psrad xmm1, 3 - psrad xmm2, 3 - psrad xmm3, 3 - movdqa xmm4, xmm0 - punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20 - punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20 - movdqa xmm5, xmm2 - punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20 - punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20 - - packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20 - packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20 - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi + 16], xmm2 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -c1: - dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 -align 16 -cn1: - dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff -align 16 -cd1: - dd 0x00000001, 0x00000001, 0x00000001, 0x00000001 -align 16 -cd3: - dd 0x00000003, 0x00000003, 0x00000003, 0x00000003 diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm new file mode 100644 index 0000000..60f7991 --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ -0,0 +1,214 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. 
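The loop that follows makes the same per-coefficient decision as the C reference quantizer: compare |coeff| against the zbin + zbin_oq dead zone, add the rounding term, multiply by quant and quant_shift, restore the sign, dequantize, and track the end-of-block position from iscan. A rough scalar model of one iteration (a sketch patterned on vp9_quantize_b; the 32x32 halving and the saturation details are omitted):

    /* Rough scalar model of one iteration of the SSSE3 quantize loop below
     * (illustrative only; not the exact reference implementation). */
    #include <stdint.h>
    #include <stdlib.h>
    static void quantize_coeff(int i, const int16_t *coeff, const int16_t *zbin,
                               const int16_t *round, const int16_t *quant,
                               const int16_t *quant_shift,
                               const int16_t *dequant, int zbin_oq,
                               const int16_t *iscan, int16_t *qcoeff,
                               int16_t *dqcoeff, int *eob) {
      const int rc = (i != 0);              /* DC uses entry 0, AC uses entry 1 */
      const int abs_c = abs(coeff[i]);
      qcoeff[i] = dqcoeff[i] = 0;
      if (abs_c >= zbin[rc] + zbin_oq) {
        int tmp = abs_c + round[rc];
        tmp = ((((tmp * quant[rc]) >> 16) + tmp) * quant_shift[rc]) >> 16;
        qcoeff[i]  = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
        dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[rc]);
        if (tmp && iscan[i] + 1 > *eob)
          *eob = iscan[i] + 1;              /* eob = max(scan position + 1) */
      }
    }
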
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movd m4, dword zbin_oqm ; m4 = zbin_oq + mova m0, [zbinq] ; m0 = zbin + punpcklwd m4, m4 + mova m1, [roundq] ; m1 = round + pshufd m4, m4, 0 + mova m2, [quantq] ; m2 = quant + paddw m0, m4 ; m0 = zbin + zbin_oq + mova m3, [r2q] ; m3 = dequant + psubw m0, [pw_1] + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob + lea coeffq, [ coeffq+ncoeffq*2] + lea iscanq, [ iscanq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, b_32x32 + paddw m6, m6 + paddw m11, m11 +%endif + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, b_32x32 + paddw m6, m6 + paddw m11, m11 +%endif + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6, m7 + pmovmskb r2, m12 + or r6, r2 + jz .skip_iter +%endif + paddw m6, m1 ; m6 += round + paddw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw 
m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw [r2], m8, 0 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [dqcoeffq+ncoeffq*2+ 0], m7 + mova [dqcoeffq+ncoeffq*2+16], m7 + mova [qcoeffq+ncoeffq*2+ 0], m7 + mova [qcoeffq+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [eobq], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 6 +QUANTIZE_FN b_32x32, 7 diff --git a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm b/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm index 8fb7d41..c4c5c54 100644 --- a/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm @@ -12,12 +12,42 @@ SECTION .text -; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD64XN 1 -cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: @@ -25,6 +55,13 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] @@ -47,21 +84,27 @@ cglobal sad64x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows INIT_XMM sse2 SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 ; unsigned int vp9_sad32x32_sse2(uint8_t *src, 
int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD32XN 1 -cglobal sad32x%1, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 mov n_rowsd, %1/2 pxor m0, m0 - .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+src_strideq] @@ -85,16 +128,14 @@ INIT_XMM sse2 SAD32XN 64 ; sad32x64_sse2 SAD32XN 32 ; sad32x32_sse2 SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 ; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD16XN 1 -cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 @@ -103,6 +144,13 @@ cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] @@ -126,16 +174,14 @@ INIT_XMM sse2 SAD16XN 32 ; sad16x32_sse2 SAD16XN 16 ; sad16x16_sse2 SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 ; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD8XN 1 -cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 @@ -144,6 +190,11 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ movhps m1, [refq+ref_strideq] movh m2, [refq+ref_strideq*2] movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif movh m3, [srcq] movhps m3, [srcq+src_strideq] movh m4, [srcq+src_strideq*2] @@ -167,16 +218,14 @@ INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 ; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); -%macro SAD4XN 1 -cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 @@ -187,6 +236,11 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, 
ref_stride, \ movd m4, [refq+ref_stride3q] punpckldq m1, m2 punpckldq m3, m4 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m3, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] @@ -209,3 +263,5 @@ cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \ INIT_MMX sse SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm new file mode 100644 index 0000000..19e2feb --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm @@ -0,0 +1,1288 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 15 + times 8 dw 1 + times 8 dw 14 + times 8 dw 2 + times 8 dw 13 + times 8 dw 3 + times 8 dw 12 + times 8 dw 4 + times 8 dw 11 + times 8 dw 5 + times 8 dw 10 + times 8 dw 6 + times 8 dw 9 + times 8 dw 7 + times 16 dw 8 + times 8 dw 7 + times 8 dw 9 + times 8 dw 6 + times 8 dw 10 + times 8 dw 5 + times 8 dw 11 + times 8 dw 4 + times 8 dw 12 + times 8 dw 3 + times 8 dw 13 + times 8 dw 2 + times 8 dw 14 + times 8 dw 1 + times 8 dw 15 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 15, 1 + times 8 db 14, 2 + times 8 db 13, 3 + times 8 db 12, 4 + times 8 db 11, 5 + times 8 db 10, 6 + times 8 db 9, 7 + times 16 db 8 + times 8 db 7, 9 + times 8 db 6, 10 + times 8 db 5, 11 + times 8 db 4, 12 + times 8 db 3, 13 + times 8 db 2, 14 + times 8 db 1, 15 + +SECTION .text + +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. 
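For reference, a plain-C sketch of what the SAD_FN/SADWxN macros in vp9_sad_sse2.asm above compute, including the new _avg variants that pavgb the reference block against a second predictor before summing absolute differences. This is not code from the patch; it assumes the second predictor is a contiguous WxH block (stride W), which is consistent with how second_predq is advanced in the assembly.

#include <stdint.h>
#include <stdlib.h>

/* Reference model of vp9_sadWxH_sse2 / vp9_sadWxH_avg_sse2.
 * Pass second_pred == NULL for the plain SAD. */
static unsigned int sad_wxh_ref(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                const uint8_t *second_pred,
                                int w, int h) {
  unsigned int sad = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      int p = ref[c];
      if (second_pred != NULL)
        p = (p + second_pred[c] + 1) >> 1;  /* pavgb: rounded byte average */
      sad += abs(src[c] - p);
    }
    src += src_stride;
    ref += ref_stride;
    if (second_pred != NULL)
      second_pred += w;                     /* packed WxH second predictor */
  }
  return sad;
}

The packed layout of the second predictor is why the assembly can simply step second_predq forward by the number of bytes consumed in each loop iteration, independent of ref_stride.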
+ pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd rax, m6 ; store sum as return value +%else ; mmsize == 8 + pshufw m4, m6, 0xe + pshufw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshufw m4, m6, 0xe + paddd m6, m4 + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 +%ifdef PIC +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse +%define sec_str sec_strideq +%else +cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%endif +%define h heightd +%define bilin_filter sseq +%else +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse +%if ARCH_X86_64 +%define h heightd +%define sec_str sec_strideq +%else +%define h dword heightm +%define sec_str sec_stridemp +%endif +%else +cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%define h heightd +%endif +%define bilin_filter bilin_filter_m +%endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar h, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? 
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] +%endif +%else ; !avg + movh m2, [srcq+src_strideq] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq*2] +%else ; mmsize == 8 + punpckldq m2, [srcq+src_strideq*2] +%endif + movh m1, [dstq] +%if mmsize == 16 + movlhps m0, m2 +%else ; mmsize == 8 + punpckldq m0, m2 +%endif + movh m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m4, [srcq+src_strideq*2] + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do 
out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m4, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] + punpckldq m4, [srcq+src_strideq+1] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m2, [srcq+src_strideq] + movh m1, [dstq] + pavgb m0, m4 + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, 
m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movh m2, [srcq] + movh m3, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + punpckldq m2, [srcq+src_strideq] + punpckldq m3, [srcq+src_strideq+1] +%endif + pavgb m2, m3 +%if mmsize == 16 + movlhps m0, m2 + movhlps m4, m2 +%else ; mmsize == 8 + punpckldq m0, m2 + pshufw m4, m2, 0xe +%endif + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m4, [srcq+src_strideq] + movh m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + 
pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movh m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movh m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 
+%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movh m1, [dstq] + paddw m4, m3 + movh m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl 
x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + add srcq, src_strideq +.x_other_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m3, [dstq+dst_strideq] + movh m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + 
packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movh m3, [dstq+dst_strideq] + paddw m2, m1 + movh m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_MMX sse +SUBPEL_VARIANCE 4 +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX sse +SUBPEL_VARIANCE 4, 1 +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4, 1 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm index 8a2a471..2ecc23e 100644 --- a/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm @@ -8,292 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
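The SUBPEL_VARIANCE macro above implements vp9_sub_pixel_varianceWxH (and its _avg form) for W = 4, 8 and 16: it bilinearly interpolates src at a fractional offset, optionally averages the prediction with a second predictor, and accumulates the sum of differences (the return value) and the sum of squared differences (*sse) against dst. The vp9_filter_block2d_bil_var_sse2 routine removed just below performed the same kind of two-tap filtering, but with 7-bit coefficients and a shift of 7. A plain-C sketch of the new computation follows; it is a reading aid, not a bit-exact drop-in from the patch, and the names are illustrative.

#include <stddef.h>
#include <stdint.h>

/* Offsets are in the 0..15 range; taps are (16 - o, o) with rounding 8 and
 * shift 4, matching the bilin_filter_m tables.  The horizontal result is
 * rounded back to 8 bits before the vertical tap, as in the assembly.
 * Offsets 0 and 8 degenerate to the copy and pavgb fast paths.  For
 * simplicity this always reads the (W+1)x(H+1) neighbourhood; the
 * assembly's fast paths avoid the extra reads when an offset is 0. */
static int subpel_variance_ref(const uint8_t *src, ptrdiff_t src_stride,
                               int x_offset, int y_offset,
                               const uint8_t *dst, ptrdiff_t dst_stride,
                               int w, int h,
                               const uint8_t *sec, ptrdiff_t sec_stride,
                               unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const uint8_t *s = src + r * src_stride + c;
      const int a = ((16 - x_offset) * s[0] + x_offset * s[1] + 8) >> 4;
      const int b = ((16 - x_offset) * s[src_stride] +
                     x_offset * s[src_stride + 1] + 8) >> 4;
      int p = ((16 - y_offset) * a + y_offset * b + 8) >> 4;
      if (sec != NULL)                      /* _avg variant: pavgb with sec */
        p = (p + sec[r * sec_stride + c] + 1) >> 1;
      const int d = p - dst[r * dst_stride + c];
      sum += d;
      sq += (unsigned int)(d * d);
    }
  }
  *sse = sq;
  return sum;  /* the SE; callers combine it with *sse into a variance */
}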
; - %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - -;void vp9_filter_block2d_bil_var_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE -sym(vp9_filter_block2d_bil_var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - push rbx - ; end prolog - - pxor xmm6, xmm6 ; - pxor xmm7, xmm7 ; - - lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding - movdqa xmm4, XMMWORD PTR [rsi] - - lea rcx, [GLOBAL(bilinear_filters_sse2)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_sse2_sp_only - - shl rax, 5 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_sse2_fp_only - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - movdqa xmm5, xmm1 - - movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line - lea rsi, [rsi + rbx] -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -filter_block2d_bil_var_sse2_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - - pmullw xmm3, [rdx] ; - pmullw xmm1, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rbx] ;ref_pixels_per_line -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 ; - jnz filter_block2d_bil_var_sse2_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 - je filter_block2d_bil_var_sse2_full_pixel - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + rax] - -filter_block2d_bil_sp_only_loop: - movq xmm3, QWORD PTR [rsi] ; - punpcklbw xmm3, xmm0 ; - movdqa xmm5, xmm3 - - pmullw xmm1, [rdx] ; - pmullw xmm3, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - movdqa xmm1, xmm5 ; - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz 
filter_block2d_bil_sp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 ; - -filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movq xmm2, QWORD PTR [rdi] ; - punpcklbw xmm2, xmm0 ; - - psubw xmm1, xmm2 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_full_pixel_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - -filter_block2d_bil_fp_only_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - lea rsi, [rsi + rdx] - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_fp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_variance: - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(7) ; sum - mov rdi, arg(8) ; sumsquared - - movd [rsi], mm2 ; xsum - movd [rdi], mm4 ; xxsum - - ; begin epilog - pop rbx - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - ;void vp9_half_horiz_vert_variance16x_h_sse2 ;( ; unsigned char *ref_ptr, @@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2): UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 
16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm b/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm deleted file mode 100644 index e9eda4f..0000000 --- a/libvpx/vp9/encoder/x86/vp9_subtract_mmx.asm +++ /dev/null @@ -1,432 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_mmx_impl) PRIVATE -sym(vp9_subtract_b_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi], mm0 - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2],mm0 - - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_mmx) PRIVATE -sym(vp9_subtract_mby_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 16 - pxor mm0, mm0 - -.submby_loop: - - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi], mm1 - movq [rdi+8], mm2 - - - movq mm1, [rsi+8] - movq mm3, [rax+8] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi+16], mm1 - movq [rdi+24], mm2 - - - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_mmx) PRIVATE -sym(vp9_subtract_mbuv_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, 
arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, 
mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm b/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm index 739d948..9824080 100644 --- a/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm @@ -8,349 +8,120 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_sse2_impl) PRIVATE -sym(vp9_subtract_b_sse2_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi], mm0 - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_sse2) PRIVATE -sym(vp9_subtract_mby_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time - -.submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] - - movdqa xmm6, xmm4 - psubb xmm4, xmm5 - - 
pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information - - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_sse2) PRIVATE -sym(vp9_subtract_mbuv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - 
movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -t80: - times 16 db 0x80 +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void vp9_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 
1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + RET diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm index 9f140c9..d3dbefe 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm @@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx): UNSHADOW_ARGS pop rbp ret - -%define mmx_filter_shift 7 - -;void vp9_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - 
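The width-dispatched subtract kernel added above (vp9_subtract_block, SSE2) replaces the per-block vp9_subtract_b/mby/mbuv routines deleted in this change. Its effect is a plain element-wise residual; a C sketch matching the prototype quoted in the file follows (illustrative only, not the patch's own C counterpart).

#include <stddef.h>
#include <stdint.h>

/* diff[r][c] = src[r][c] - pred[r][c], with diff_stride counted in int16_t
 * elements (the assembly scales it by 2 bytes when forming addresses). */
static void subtract_block_ref(int rows, int cols,
                               int16_t *diff, ptrdiff_t diff_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               const uint8_t *pred, ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c)
      diff[c] = (int16_t)(src[c] - pred[c]);
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}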
movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - - -;void vp9_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 ; - - paddw mm2, mm4 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - 
- psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm index 896dd18..2c50881 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ b/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm @@ -11,8 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - ;unsigned int vp9_get_mb_ss_sse2 ;( ; short *src_ptr @@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2): UNSHADOW_ARGS pop rbp ret - - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm deleted file mode 100644 index 98a4a16..0000000 --- a/libvpx/vp9/encoder/x86/vp9_variance_impl_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - - -;void vp9_filter_block2d_bil_var_ssse3 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -;Note: The filter coefficient at offset=0 is 128. Since the second register -;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. 
-global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE -sym(vp9_filter_block2d_bil_var_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .filter_block2d_bil_var_ssse3_sp_only - - shl rax, 4 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je .filter_block2d_bil_var_ssse3_fp_only - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi+1] - movdqa xmm2, xmm0 - - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, [rax] - pmaddubsw xmm2, [rax] - - paddw xmm0, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm0, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - packuswb xmm0, xmm2 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + r8] -%endif - -.filter_block2d_bil_var_ssse3_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - packuswb xmm1, xmm3 - - movdqa xmm2, xmm0 - movdqa xmm0, xmm1 - movdqa xmm3, xmm2 - - punpcklbw xmm2, xmm1 - punpckhbw xmm3, xmm1 - pmaddubsw xmm2, [rdx] - pmaddubsw xmm3, [rdx] - - paddw xmm2, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm2, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm1, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm1, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm2, xmm1 - psubw xmm3, xmm5 - paddw xmm6, xmm2 - paddw xmm6, xmm3 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm2 - paddd xmm7, xmm3 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rsi, [rsi + r8] - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_var_ssse3_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je .filter_block2d_bil_var_ssse3_full_pixel - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - movdqu xmm1, XMMWORD PTR [rsi] - movdqa xmm0, xmm1 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - lea rsi, [rsi + rax] - -.filter_block2d_bil_sp_only_loop: - movdqu xmm3, XMMWORD PTR [rsi] - movdqa xmm2, xmm1 - movdqa xmm0, xmm3 - - punpcklbw xmm1, xmm3 - punpckhbw xmm2, xmm3 - pmaddubsw xmm1, [rdx] - pmaddubsw xmm2, [rdx] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - movq xmm3, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm3, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw 
xmm5, xmm4 - - psubw xmm1, xmm3 - psubw xmm2, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - movdqa xmm1, xmm0 - lea rsi, [rsi + rax] ;ref_pixels_per_line - -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_sp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 - -.filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] - punpcklbw xmm1, xmm0 - movq xmm2, QWORD PTR [rsi+8] - punpcklbw xmm2, xmm0 - - movq xmm3, QWORD PTR [rdi] - punpcklbw xmm3, xmm0 - movq xmm4, QWORD PTR [rdi+8] - punpcklbw xmm4, xmm0 - - psubw xmm1, xmm3 - psubw xmm2, xmm4 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rdx] ;src_pixels_per_line - sub rcx, 1 - jnz .filter_block2d_bil_full_pixel_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -.filter_block2d_bil_fp_only_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm2, XMMWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm2, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm2 - psubw xmm3, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm3 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm1 - paddd xmm7, xmm3 - - lea rsi, [rsi + rdx] -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_fp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_variance: - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(7) ;[Sum] - mov rdi, arg(8) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 diff --git 
a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c index bad1cfa..d141560 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_mmx.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_mmx.c @@ -13,27 +13,6 @@ #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern void filter_block1d_h6_mmx -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); -extern void filter_block1d_v6_mmx -( - const short *src_ptr, - unsigned char *output_ptr, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); - extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr); extern unsigned int vp9_get8x8var_mmx ( @@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx unsigned int *SSE, int *Sum ); -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - unsigned int vp9_variance4x4_mmx( const unsigned char *src_ptr, @@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx( return (var - (((unsigned int)avg * avg) >> 7)); } - -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - -unsigned int vp9_sub_pixel_variance4x4_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) - -{ - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp9_sub_pixel_variance8x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp9_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - 
return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); - - -} - -unsigned int vp9_sub_pixel_mse16x16_mmx( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp9_variance_halfpixvar16x16_h_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_v_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_hv_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} diff --git a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c index 67ca925..b4ff850 100644 --- a/libvpx/vp9/encoder/x86/vp9_variance_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_variance_sse2.c @@ -9,29 +9,11 @@ */ #include "vpx_config.h" + #include "vp9/encoder/vp9_variance.h" #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -#define HALFNDX 8 - -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int 
src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); - -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - extern unsigned int vp9_get4x4var_mmx ( const unsigned char *src_ptr, @@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2 unsigned int *SSE, int *Sum ); -void vp9_filter_block2d_bil_var_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); void vp9_half_horiz_vert_variance8x_h_sse2 ( const unsigned char *ref_ptr, @@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - typedef unsigned int (*get_var_sse2) ( const unsigned char *src_ptr, int source_stride, @@ -375,347 +343,162 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, return (var - (((int64_t)avg * avg) >> 11)); } -unsigned int vp9_sub_pixel_variance4x4_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +#define DECL(w, opt) \ +int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } - -unsigned int vp9_sub_pixel_variance8x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - 
const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, int *avg) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum1, &xxsum1 - ); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - *avg = xsum0; -} - -unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg; - unsigned int sse; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse, &avg); - *sse_ptr = sse; - - return (sse - (((unsigned int) avg * avg) >> 8)); -} - -unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3; - unsigned int sse0, sse1, sse2, sse3; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse3, &avg3); - sse0 += sse1 + sse2 + sse3; - avg0 += 
avg1 + avg2 + avg3; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 10)); -} - -unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3, avg4; - unsigned int sse0, sse1, sse2, sse3, sse4; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse3, &avg3); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3; - sse0 += sse1 + sse2 + sse3; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 12)); -} - -unsigned int vp9_sub_pixel_mse16x16_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int 
vp9_sub_pixel_variance16x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum1, &xxsum1); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + +#define DECL(w, opt) \ +int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + const uint8_t *sec, \ + ptrdiff_t sec_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = 
vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN unsigned int vp9_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, diff --git a/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c b/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c deleted file mode 100644 index 882acad..0000000 --- a/libvpx/vp9/encoder/x86/vp9_variance_ssse3.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" -#include "vpx_ports/mem.h" - -#define HALFNDX 8 - -extern void vp9_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_ssse3 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp9_sub_pixel_variance16x16_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0; - unsigned int xxsum0; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - -unsigned int vp9_sub_pixel_variance16x8_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0; - unsigned int xxsum0; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} diff --git a/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c b/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c deleted file mode 100644 index 6016e14..0000000 --- a/libvpx/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ /dev/null @@ -1,55 
+0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "vpx_ports/x86.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/x86/vp9_dct_mmx.h" - -// TODO(jimbankoski) Consider rewriting the c to take the same values rather -// than going through these pointer conversions -#if 0 && HAVE_MMX -void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp9_short_fdct4x4_mmx(input, output, pitch); - vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); -} - -void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. - vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); -} - -#endif - -#if 0 && HAVE_SSE2 -void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. 
- vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); -} - -#endif diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index 7a74833..5a0c1c9 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -14,7 +14,6 @@ VP9_COMMON_SRCS-yes += common/vp9_pragmas.h VP9_COMMON_SRCS-yes += common/vp9_ppflags.h VP9_COMMON_SRCS-yes += common/vp9_onyx.h VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c -VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c VP9_COMMON_SRCS-yes += common/vp9_convolve.c VP9_COMMON_SRCS-yes += common/vp9_convolve.h VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c @@ -39,7 +38,6 @@ VP9_COMMON_SRCS-yes += common/vp9_extend.h VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h -VP9_COMMON_SRCS-yes += common/vp9_modecont.h VP9_COMMON_SRCS-yes += common/vp9_mv.h VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h VP9_COMMON_SRCS-yes += common/vp9_pred_common.h @@ -60,9 +58,6 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_treecoder.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c -VP9_COMMON_SRCS-yes += common/vp9_mbpitch.c -VP9_COMMON_SRCS-yes += common/vp9_modecont.c -VP9_COMMON_SRCS-yes += common/vp9_modecontext.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h VP9_COMMON_SRCS-yes += common/vp9_quant_common.c @@ -70,37 +65,31 @@ VP9_COMMON_SRCS-yes += common/vp9_reconinter.c VP9_COMMON_SRCS-yes += common/vp9_reconintra.c VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c VP9_COMMON_SRCS-yes += common/vp9_treecoder.c -VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c +VP9_COMMON_SRCS-yes += common/vp9_common_data.c +VP9_COMMON_SRCS-yes += common/vp9_common_data.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif -# common (c) -ifeq ($(CONFIG_CSM),yes) -VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c -VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm -endif - VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c -$(eval $(call asm_offsets_template,\ - 
vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c)) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index e5b5089..be7828f 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -233,10 +233,10 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->width = cfg.g_w; oxcf->height = cfg.g_h; /* guess a frame rate if out of whack, use 30 */ - oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num); + oxcf->framerate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num); - if (oxcf->frame_rate > 180) { - oxcf->frame_rate = 30; + if (oxcf->framerate > 180) { + oxcf->framerate = 30; } switch (cfg.g_pass) { @@ -1032,6 +1032,7 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { {VP8E_SET_CQ_LEVEL, set_param}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, {VP9E_SET_LOSSLESS, set_param}, + {VP9E_SET_FRAME_PARALLEL_DECODING, set_param}, {VP9_GET_REFERENCE, get_reference}, { -1, NULL}, }; diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index ea6946b..05029b9 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -19,36 +19,29 @@ #include "decoder/vp9_onyxd_int.h" #include "vp9/vp9_iface_common.h" -#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) -typedef vpx_codec_stream_info_t vp8_stream_info_t; +#define VP9_CAP_POSTPROC (CONFIG_POSTPROC ? 
VPX_CODEC_CAP_POSTPROC : 0) +typedef vpx_codec_stream_info_t vp9_stream_info_t; /* Structures for handling memory allocations */ typedef enum { - VP8_SEG_ALG_PRIV = 256, - VP8_SEG_MAX + VP9_SEG_ALG_PRIV = 256, + VP9_SEG_MAX } mem_seg_id_t; #define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0]))) -static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t); +static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, + vpx_codec_flags_t flags); -typedef struct { - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long(*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - -static const mem_req_t vp8_mem_req_segs[] = { - {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz}, - {VP8_SEG_MAX, 0, 0, 0, NULL} +static const mem_req_t vp9_mem_req_segs[] = { + {VP9_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, priv_sz}, + {VP9_SEG_MAX, 0, 0, 0, NULL} }; struct vpx_codec_alg_priv { vpx_codec_priv_t base; - vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs) - 1]; + vpx_codec_mmap_t mmaps[NELEMENTS(vp9_mem_req_segs) - 1]; vpx_codec_dec_cfg_t cfg; - vp8_stream_info_t si; + vp9_stream_info_t si; int defer_alloc; int decoder_init; VP9D_PTR pbi; @@ -67,8 +60,8 @@ struct vpx_codec_alg_priv { int invert_tile_order; }; -static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, - vpx_codec_flags_t flags) { +static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, + vpx_codec_flags_t flags) { /* Although this declaration is constant, we can't use it in the requested * segments list because we want to define the requested segments list * before defining the private type (so that the number of memory maps is @@ -78,59 +71,7 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, return sizeof(vpx_codec_alg_priv_t); } - -static void vp8_mmap_dtor(vpx_codec_mmap_t *mmap) { - free(mmap->priv); -} - -static vpx_codec_err_t vp8_mmap_alloc(vpx_codec_mmap_t *mmap) { - vpx_codec_err_t res; - unsigned int align; - - align = mmap->align ? mmap->align - 1 : 0; - - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); - - res = (mmap->priv) ? VPX_CODEC_OK : VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vp8_mmap_dtor; - return res; -} - -static vpx_codec_err_t vp8_validate_mmaps(const vp8_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - vpx_codec_flags_t init_flags) { - int i; - vpx_codec_err_t res = VPX_CODEC_OK; - - for (i = 0; i < NELEMENTS(vp8_mem_req_segs) - 1; i++) { - /* Ensure the segment has been allocated */ - if (!mmaps[i].base) { - res = VPX_CODEC_MEM_ERROR; - break; - } - - /* Verify variable size segment is big enough for the current si. 
*/ - if (vp8_mem_req_segs[i].calc_sz) { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < vp8_mem_req_segs[i].calc_sz(&cfg, init_flags)) { - res = VPX_CODEC_MEM_ERROR; - break; - } - } - } - - return res; -} - -static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { +static void vp9_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { int i; ctx->priv = mmap->base; @@ -139,7 +80,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { ctx->priv->alg_priv = mmap->base; for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) - ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id; + ctx->priv->alg_priv->mmaps[i].id = vp9_mem_req_segs[i].id; ctx->priv->alg_priv->mmaps[0] = *mmap; ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); @@ -152,20 +93,11 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { } } -static void *mmap_lkup(vpx_codec_alg_priv_t *ctx, unsigned int id) { - int i; - - for (i = 0; i < NELEMENTS(ctx->mmaps); i++) - if (ctx->mmaps[i].id == id) - return ctx->mmaps[i].base; - - return NULL; -} -static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { +static void vp9_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { /* nothing to clean up */ } -static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, +static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; @@ -176,15 +108,15 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, if (!ctx->priv) { vpx_codec_mmap_t mmap; - mmap.id = vp8_mem_req_segs[0].id; + mmap.id = vp9_mem_req_segs[0].id; mmap.sz = sizeof(vpx_codec_alg_priv_t); - mmap.align = vp8_mem_req_segs[0].align; - mmap.flags = vp8_mem_req_segs[0].flags; + mmap.align = vp9_mem_req_segs[0].align; + mmap.flags = vp9_mem_req_segs[0].flags; - res = vp8_mmap_alloc(&mmap); + res = vpx_mmap_alloc(&mmap); if (!res) { - vp8_init_ctx(ctx, &mmap); + vp9_init_ctx(ctx, &mmap); ctx->priv->alg_priv->defer_alloc = 1; /*post processing level initialized to do nothing */ @@ -194,7 +126,7 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, return res; } -static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { +static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) { int i; vp9_remove_decompressor(ctx->pbi); @@ -207,43 +139,44 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { return VPX_CODEC_OK; } -static vpx_codec_err_t vp8_peek_si(const uint8_t *data, +static vpx_codec_err_t vp9_peek_si(const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si) { vpx_codec_err_t res = VPX_CODEC_OK; - if (data + data_sz <= data) + if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM; + + if (data + data_sz <= data) { res = VPX_CODEC_INVALID_PARAM; - else { - si->is_kf = 0; + } else { + const int frame_marker = (data[0] >> 6) & 0x3; + const int version = (data[0] >> 4) & 0x3; + if (frame_marker != 0x2) return VPX_CODEC_UNSUP_BITSTREAM; + if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; - if (data_sz >= 8 && (data[0] & 0xD8) == 0x80) { /* I-Frame */ + si->is_kf = !((data[0] >> 2) & 0x1); + if (si->is_kf) { const uint8_t *c = data + 1; - si->is_kf = 1; if (c[0] != SYNC_CODE_0 || c[1] != SYNC_CODE_1 || c[2] != SYNC_CODE_2) - res = VPX_CODEC_UNSUP_BITSTREAM; + return VPX_CODEC_UNSUP_BITSTREAM; - si->w = (c[3] << 8) | c[4]; - si->h = (c[5] << 8) | c[6]; - - // printf("w=%d, h=%d\n", si->w, si->h); - if (!(si->h | si->w)) - res = VPX_CODEC_UNSUP_BITSTREAM; - 
} else - res = VPX_CODEC_UNSUP_BITSTREAM; + c += 3; + si->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1; + si->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1; + } } return res; } -static vpx_codec_err_t vp8_get_si(vpx_codec_alg_priv_t *ctx, +static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, vpx_codec_stream_info_t *si) { unsigned int sz; - if (si->sz >= sizeof(vp8_stream_info_t)) - sz = sizeof(vp8_stream_info_t); + if (si->sz >= sizeof(vp9_stream_info_t)) + sz = sizeof(vp9_stream_info_t); else sz = sizeof(vpx_codec_stream_info_t); @@ -293,27 +226,29 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, cfg.w = ctx->si.w; cfg.h = ctx->si.h; - ctx->mmaps[i].id = vp8_mem_req_segs[i].id; - ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz; - ctx->mmaps[i].align = vp8_mem_req_segs[i].align; - ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags; + ctx->mmaps[i].id = vp9_mem_req_segs[i].id; + ctx->mmaps[i].sz = vp9_mem_req_segs[i].sz; + ctx->mmaps[i].align = vp9_mem_req_segs[i].align; + ctx->mmaps[i].flags = vp9_mem_req_segs[i].flags; if (!ctx->mmaps[i].sz) - ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg, + ctx->mmaps[i].sz = vp9_mem_req_segs[i].calc_sz(&cfg, ctx->base.init_flags); - res = vp8_mmap_alloc(&ctx->mmaps[i]); + res = vpx_mmap_alloc(&ctx->mmaps[i]); } if (!res) - vp8_finalize_mmaps(ctx); + vp9_finalize_mmaps(ctx); ctx->defer_alloc = 0; } /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { - res = vp8_validate_mmaps(&ctx->si, ctx->mmaps, ctx->base.init_flags); + res = vpx_validate_mmaps(&ctx->si, ctx->mmaps, + vp9_mem_req_segs, NELEMENTS(vp9_mem_req_segs), + ctx->base.init_flags); if (!res) { VP9D_CONFIG oxcf; @@ -483,7 +418,7 @@ static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, return res; } -static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, +static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; @@ -501,24 +436,22 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, return img; } - -static -vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) { +static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx, + vpx_codec_mmap_t *mmap, + vpx_codec_iter_t *iter) { vpx_codec_err_t res; const mem_req_t *seg_iter = *iter; /* Get address of next segment request */ do { if (!seg_iter) - seg_iter = vp8_mem_req_segs; - else if (seg_iter->id != VP8_SEG_MAX) + seg_iter = vp9_mem_req_segs; + else if (seg_iter->id != VP9_SEG_MAX) seg_iter++; *iter = (vpx_codec_iter_t)seg_iter; - if (seg_iter->id != VP8_SEG_MAX) { + if (seg_iter->id != VP9_SEG_MAX) { mmap->id = seg_iter->id; mmap->sz = seg_iter->sz; mmap->align = seg_iter->align; @@ -535,15 +468,15 @@ vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx, return res; } -static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, +static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; int i, done; if (!ctx->priv) { - if (mmap->id == VP8_SEG_ALG_PRIV) { + if (mmap->id == VP9_SEG_ALG_PRIV) { if (!ctx->priv) { - vp8_init_ctx(ctx, mmap); + vp9_init_ctx(ctx, mmap); res = VPX_CODEC_OK; } } @@ -564,17 +497,16 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, } if (done && !res) { - vp8_finalize_mmaps(ctx->priv->alg_priv); + vp9_finalize_mmaps(ctx->priv->alg_priv); res = ctx->iface->init(ctx, NULL); } return 
res; } - -static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -591,9 +523,9 @@ static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx, } -static vpx_codec_err_t vp9_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -626,9 +558,9 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, + int ctr_id, + va_list args) { #if CONFIG_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); @@ -644,9 +576,9 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { +static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) { #if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC int data = va_arg(args, int); @@ -665,9 +597,9 @@ static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { +static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) { int *update_info = va_arg(args, int *); VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; @@ -680,9 +612,9 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } -static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { +static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + int ctrl_id, + va_list args) { int *corrupted = va_arg(args, int *); @@ -704,15 +636,15 @@ static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx, } static vpx_codec_ctrl_fn_map_t ctf_maps[] = { - {VP8_SET_REFERENCE, vp9_set_reference}, - {VP8_COPY_REFERENCE, vp9_copy_reference}, - {VP8_SET_POSTPROC, vp8_set_postproc}, - {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options}, - {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options}, - {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options}, - {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options}, - {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, - {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, + {VP8_SET_REFERENCE, set_reference}, + {VP8_COPY_REFERENCE, copy_reference}, + {VP8_SET_POSTPROC, set_postproc}, + {VP8_SET_DBG_COLOR_REF_FRAME, set_dbg_options}, + {VP8_SET_DBG_COLOR_MB_MODES, set_dbg_options}, + {VP8_SET_DBG_COLOR_B_MODES, set_dbg_options}, + {VP8_SET_DBG_DISPLAY_MV, set_dbg_options}, + {VP8D_GET_LAST_REF_UPDATES, get_last_ref_updates}, + {VP8D_GET_FRAME_CORRUPTED, get_frame_corrupted}, {VP9_GET_REFERENCE, get_reference}, {VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order}, { -1, NULL}, @@ -725,18 +657,18 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = { CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP8_CAP_POSTPROC, + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC, /* vpx_codec_caps_t caps; */ - vp8_init, /* vpx_codec_init_fn_t init; */ - 
vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ + vp9_init, /* vpx_codec_init_fn_t init; */ + vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */ ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */ - vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */ + vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */ + vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */ { - vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ - vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ + vp9_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ + vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp9_decode, /* vpx_codec_decode_fn_t decode; */ - vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ }, { /* encoder functions */ diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h index dc41d77..ed0122c 100644 --- a/libvpx/vp9/vp9_iface_common.h +++ b/libvpx/vp9/vp9_iface_common.h @@ -29,7 +29,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, img->fmt = VPX_IMG_FMT_I420; } img->w = yv12->y_stride; - img->h = multiple8(yv12->y_height + 2 * VP9BORDERINPIXELS); + img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3); img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; img->x_chroma_shift = yv12->uv_width < yv12->y_width; @@ -74,8 +74,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0; yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; - yv12->clrtype = REG_YUV; - #if CONFIG_ALPHA // For development purposes, force alpha to hold the same data a Y for now. yv12->alpha_buffer = yv12->y_buffer; diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 4bed6c0..dee83c9 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -58,6 +58,8 @@ VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_sad_c.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h +VP9_CX_SRCS-yes += encoder/vp9_subexp.c +VP9_CX_SRCS-yes += encoder/vp9_subexp.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c @@ -73,27 +75,24 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm 
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm +ifeq ($(ARCH_X86_64),yes) +VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm +endif VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk index 7ae3219..6cad293 100644 --- a/libvpx/vp9/vp9dx.mk +++ b/libvpx/vp9/vp9dx.mk @@ -17,7 +17,6 @@ VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) VP9_DX_SRCS-yes += vp9_dx_iface.c -VP9_DX_SRCS-yes += decoder/vp9_asm_dec_offsets.c VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.c VP9_DX_SRCS-yes += decoder/vp9_decodframe.c @@ -33,10 +32,10 @@ VP9_DX_SRCS-yes += decoder/vp9_treereader.h VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c VP9_DX_SRCS-yes += decoder/vp9_idct_blk.h +VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c +VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c - -$(eval $(call asm_offsets_template,\ - vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c)) +VP9_DX_SRCS-$(HAVE_NEON) += decoder/arm/neon/vp9_add_constant_residual_neon$(ASM) diff --git a/libvpx/vpx/internal/vpx_codec_internal.h b/libvpx/vpx/internal/vpx_codec_internal.h index d7bcd46..05fed97 100644 --- a/libvpx/vpx/internal/vpx_codec_internal.h +++ b/libvpx/vpx/internal/vpx_codec_internal.h @@ -94,9 +94,10 @@ typedef vpx_codec_err_t (*vpx_codec_destroy_fn_t)(vpx_codec_alg_priv_t *ctx); /*!\brief parse stream info function pointer prototype * - * Performs high level parsing of the bitstream. This function is called by - * the generic vpx_codec_parse_stream() wrapper function, so plugins implementing - * this interface may trust the input parameters to be properly initialized. + * Performs high level parsing of the bitstream. This function is called by the + * generic vpx_codec_peek_stream_info() wrapper function, so plugins + * implementing this interface may trust the input parameters to be properly + * initialized. 
* * \param[in] data Pointer to a block of data to parse * \param[in] data_sz Size of the data buffer @@ -301,7 +302,7 @@ struct vpx_codec_iface { vpx_codec_set_mmap_fn_t set_mmap; /**< \copydoc ::vpx_codec_set_mmap_fn_t */ struct vpx_codec_dec_iface { vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ - vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ + vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ } dec; @@ -473,4 +474,30 @@ static void vpx_internal_error(struct vpx_internal_error_info *info, if (info->setjmp) longjmp(info->jmp, info->error_code); } + +//------------------------------------------------------------------------------ +// mmap interface + +typedef struct { + unsigned int id; + unsigned long sz; + unsigned int align; + unsigned int flags; + unsigned long (*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); +} mem_req_t; + +// Allocates mmap.priv and sets mmap.base based on mmap.sz/align/flags +// requirements. +// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. +vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap); + +// Frees mmap.base allocated by a call to vpx_mmap_alloc(). +void vpx_mmap_dtor(vpx_codec_mmap_t *mmap); + +// Checks each mmap has the size requirement specificied by mem_reqs. +// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. +vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, + const vpx_codec_mmap_t *mmaps, + const mem_req_t *mem_reqs, int nreqs, + vpx_codec_flags_t init_flags); #endif diff --git a/libvpx/vpx/src/vpx_codec.c b/libvpx/vpx/src/vpx_codec.c index 61d7f4c..1f664ae 100644 --- a/libvpx/vpx/src/vpx_codec.c +++ b/libvpx/vpx/src/vpx_codec.c @@ -14,6 +14,7 @@ * */ #include <stdarg.h> +#include <stdlib.h> #include "vpx/vpx_integer.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" @@ -133,3 +134,51 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } + +//------------------------------------------------------------------------------ +// mmap interface + +vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap) { + unsigned int align = mmap->align ? mmap->align - 1 : 0; + + if (mmap->flags & VPX_CODEC_MEM_ZERO) + mmap->priv = calloc(1, mmap->sz + align); + else + mmap->priv = malloc(mmap->sz + align); + + if (mmap->priv == NULL) return VPX_CODEC_MEM_ERROR; + mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); + mmap->dtor = vpx_mmap_dtor; + return VPX_CODEC_OK; +} + +void vpx_mmap_dtor(vpx_codec_mmap_t *mmap) { + free(mmap->priv); +} + +vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, + const vpx_codec_mmap_t *mmaps, + const mem_req_t *mem_reqs, int nreqs, + vpx_codec_flags_t init_flags) { + int i; + + for (i = 0; i < nreqs - 1; ++i) { + /* Ensure the segment has been allocated */ + if (mmaps[i].base == NULL) { + return VPX_CODEC_MEM_ERROR; + } + + /* Verify variable size segment is big enough for the current si. 
*/ + if (mem_reqs[i].calc_sz != NULL) { + vpx_codec_dec_cfg_t cfg; + + cfg.w = si->w; + cfg.h = si->h; + + if (mmaps[i].sz < mem_reqs[i].calc_sz(&cfg, init_flags)) { + return VPX_CODEC_MEM_ERROR; + } + } + } + return VPX_CODEC_OK; +} diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index 754a615..b18155b 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++ b/libvpx/vpx_scale/generic/yv12config.c @@ -170,6 +170,8 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->y_height = aligned_height; ybf->y_stride = y_stride; + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; ybf->uv_width = uv_width; ybf->uv_height = uv_height; ybf->uv_stride = uv_stride; diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c index c38fb80..cc8da2a 100644 --- a/libvpx/vpx_scale/generic/yv12extend.c +++ b/libvpx/vpx_scale/generic/yv12extend.c @@ -96,15 +96,16 @@ vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { } #if CONFIG_VP9 -void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, - int subsampling_x, int subsampling_y) { +static void extend_frame(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y, + int ext_size) { const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x; const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y; - const int c_et = ybf->border >> subsampling_y; - const int c_el = ybf->border >> subsampling_x; - const int c_eb = (ybf->border + ybf->y_height - ybf->y_crop_height + + const int c_et = ext_size >> subsampling_y; + const int c_el = ext_size >> subsampling_x; + const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height + subsampling_y) >> subsampling_y; - const int c_er = (ybf->border + ybf->y_width - ybf->y_crop_width + + const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width + subsampling_x) >> subsampling_x; assert(ybf->y_height - ybf->y_crop_height < 16); @@ -114,9 +115,9 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, - ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); @@ -124,6 +125,19 @@ void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er); } + + +void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y) { + extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); +} + +void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, + int subsampling_x, int subsampling_y) { + const int inner_bw = ybf->border > VP9INNERBORDERINPIXLES ? 
+ VP9INNERBORDERINPIXLES : ybf->border; + extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); +} #endif /**************************************************************************** diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.sh b/libvpx/vpx_scale/vpx_scale_rtcd.sh index b4f8907..21d1e52 100644 --- a/libvpx/vpx_scale/vpx_scale_rtcd.sh +++ b/libvpx/vpx_scale/vpx_scale_rtcd.sh @@ -28,4 +28,7 @@ specialize vp8_yv12_copy_y neon if [ "$CONFIG_VP9" = "yes" ]; then prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" specialize vp9_extend_frame_borders + + prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y" + specialize vp9_extend_frame_inner_borders_c fi diff --git a/libvpx/vpx_scale/yv12config.h b/libvpx/vpx_scale/yv12config.h index 7b8bd85..66e587a 100644 --- a/libvpx/vpx_scale/yv12config.h +++ b/libvpx/vpx_scale/yv12config.h @@ -18,27 +18,10 @@ extern "C" { #include "vpx/vpx_integer.h" #define VP8BORDERINPIXELS 32 -#define VP9BORDERINPIXELS 96 +#define VP9INNERBORDERINPIXLES 96 +#define VP9BORDERINPIXELS 160 #define VP9_INTERP_EXTEND 4 - /************************************* - For INT_YUV: - - Y = (R+G*2+B)/4; - U = (R-B)/2; - V = (G*2 - R - B)/4; - And - R = Y+U-V; - G = Y+V; - B = Y-U-V; - ************************************/ - typedef enum - { - REG_YUV = 0, /* Regular yuv */ - INT_YUV = 1 /* The type of yuv that can be tranfer to and from RGB through integer transform */ - } - YUV_TYPE; - typedef struct yv12_buffer_config { int y_width; int y_height; @@ -49,6 +32,8 @@ extern "C" { int uv_width; int uv_height; + int uv_crop_width; + int uv_crop_height; int uv_stride; /* int uvinternal_width; */ @@ -65,7 +50,6 @@ extern "C" { int buffer_alloc_sz; int border; int frame_size; - YUV_TYPE clrtype; int corrupted; int flags; diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index a60b84d..547b572 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -1180,22 +1180,22 @@ static void usage_exit() { exec_name); fprintf(stderr, "\nOptions:\n"); - arg_show_usage(stdout, main_args); + arg_show_usage(stderr, main_args); fprintf(stderr, "\nEncoder Global Options:\n"); - arg_show_usage(stdout, global_args); + arg_show_usage(stderr, global_args); fprintf(stderr, "\nRate Control Options:\n"); - arg_show_usage(stdout, rc_args); + arg_show_usage(stderr, rc_args); fprintf(stderr, "\nTwopass Rate Control Options:\n"); - arg_show_usage(stdout, rc_twopass_args); + arg_show_usage(stderr, rc_twopass_args); fprintf(stderr, "\nKeyframe Placement Options:\n"); - arg_show_usage(stdout, kf_args); + arg_show_usage(stderr, kf_args); #if CONFIG_VP8_ENCODER fprintf(stderr, "\nVP8 Specific Options:\n"); - arg_show_usage(stdout, vp8_args); + arg_show_usage(stderr, vp8_args); #endif #if CONFIG_VP9_ENCODER fprintf(stderr, "\nVP9 Specific Options:\n"); - arg_show_usage(stdout, vp9_args); + arg_show_usage(stderr, vp9_args); #endif fprintf(stderr, "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt index e74102e..d756208 100644 --- a/mips-dspr2/libvpx_srcs.txt +++ b/mips-dspr2/libvpx_srcs.txt @@ -66,7 +66,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -80,7 +79,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h 
vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/bitstream.c vp8/encoder/bitstream.h vp8/encoder/block.h @@ -136,8 +134,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -161,10 +160,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -192,7 +187,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -201,6 +195,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h index 2905eae..0752f45 100644 --- a/mips-dspr2/vp9_rtcd.h +++ b/mips-dspr2/vp9_rtcd.h @@ -38,28 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -void vp9_copy_mem16x16_dspr2(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_dspr2 +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -void vp9_copy_mem8x8_dspr2(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_dspr2 +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void 
vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define 
vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 
vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -79,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -97,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void 
vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -160,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/mips-dspr2/vpx_config.h b/mips-dspr2/vpx_config.h index 0ca4657..13a092d 100644 --- a/mips-dspr2/vpx_config.h +++ b/mips-dspr2/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/mips-dspr2/vpx_scale_rtcd.h b/mips-dspr2/vpx_scale_rtcd.h index 7af466a..be038f4 100644 --- a/mips-dspr2/vpx_scale_rtcd.h +++ b/mips-dspr2/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void 
vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h" diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt index 8c1ec80..402ac24 100644 --- a/mips/libvpx_srcs.txt +++ b/mips/libvpx_srcs.txt @@ -60,7 +60,6 @@ vp8/common/treecoder.c vp8/common/treecoder.h vp8/common/variance_c.c vp8/common/variance.h -vp8/common/vp8_asm_com_offsets.c vp8/common/vp8_entropymodedata.h vp8/decoder/dboolhuff.c vp8/decoder/dboolhuff.h @@ -74,7 +73,6 @@ vp8/decoder/onyxd_if.c vp8/decoder/onyxd_int.h vp8/decoder/threading.c vp8/decoder/treereader.h -vp8/decoder/vp8_asm_dec_offsets.c vp8/encoder/bitstream.c vp8/encoder/bitstream.h vp8/encoder/block.h @@ -130,8 +128,9 @@ vp8/vp8dx.mk vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c vp9/common/vp9_alloccommon.h -vp9/common/vp9_asm_com_offsets.c vp9/common/vp9_blockd.h +vp9/common/vp9_common_data.c +vp9/common/vp9_common_data.h vp9/common/vp9_common.h vp9/common/vp9_convolve.c vp9/common/vp9_convolve.h @@ -155,10 +154,6 @@ vp9/common/vp9_idct.h vp9/common/vp9_loopfilter.c vp9/common/vp9_loopfilter_filters.c vp9/common/vp9_loopfilter.h -vp9/common/vp9_mbpitch.c -vp9/common/vp9_modecont.c -vp9/common/vp9_modecontext.c -vp9/common/vp9_modecont.h vp9/common/vp9_mv.h vp9/common/vp9_mvref_common.c vp9/common/vp9_mvref_common.h @@ -186,7 +181,6 @@ vp9/common/vp9_tile_common.c vp9/common/vp9_tile_common.h vp9/common/vp9_treecoder.c vp9/common/vp9_treecoder.h -vp9/decoder/vp9_asm_dec_offsets.c vp9/decoder/vp9_dboolhuff.c vp9/decoder/vp9_dboolhuff.h vp9/decoder/vp9_decodemv.c @@ -195,6 +189,8 @@ vp9/decoder/vp9_decodframe.c vp9/decoder/vp9_decodframe.h vp9/decoder/vp9_detokenize.c vp9/decoder/vp9_detokenize.h +vp9/decoder/vp9_dsubexp.c +vp9/decoder/vp9_dsubexp.h vp9/decoder/vp9_idct_blk.c vp9/decoder/vp9_idct_blk.h vp9/decoder/vp9_onyxd.h diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h index 1d7b4d2..0752f45 100644 --- a/mips/vp9_rtcd.h +++ b/mips/vp9_rtcd.h @@ -38,26 +38,161 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob); void vp9_idct_add_32x32_c(int16_t *q, uint8_t *dst, int stride, int eob); #define vp9_idct_add_32x32 vp9_idct_add_32x32_c -void vp9_copy_mem16x16_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem16x16 vp9_copy_mem16x16_c +void vp9_d27_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_4x4 vp9_d27_predictor_4x4_c -void vp9_copy_mem8x8_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x8 vp9_copy_mem8x8_c +void vp9_d45_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c -void vp9_copy_mem8x4_c(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch); -#define vp9_copy_mem8x4 vp9_copy_mem8x4_c +void vp9_d63_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c -void vp9_build_intra_predictors_c(uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available); -#define vp9_build_intra_predictors vp9_build_intra_predictors_c +void vp9_h_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t 
*yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c -void vp9_build_intra_predictors_sby_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sby_s vp9_build_intra_predictors_sby_s_c +void vp9_d117_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c -void vp9_build_intra_predictors_sbuv_s_c(struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize); -#define vp9_build_intra_predictors_sbuv_s vp9_build_intra_predictors_sbuv_s_c +void vp9_d135_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c -void vp9_intra4x4_predict_c(struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride); -#define vp9_intra4x4_predict vp9_intra4x4_predict_c +void vp9_d153_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_v_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_tm_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_dc_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_d27_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_8x8 vp9_d27_predictor_8x8_c + +void vp9_d45_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_h_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c + +void vp9_d117_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_v_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +void vp9_tm_predictor_8x8_c(uint8_t 
*ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_dc_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_d27_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_16x16 vp9_d27_predictor_16x16_c + +void vp9_d45_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d63_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_h_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c + +void vp9_d117_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d135_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d153_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_v_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_tm_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_dc_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_d27_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d27_predictor_32x32 vp9_d27_predictor_32x32_c + +void vp9_d45_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d63_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, 
uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_h_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c + +void vp9_d117_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d135_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d153_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_v_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_tm_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_dc_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c void vp9_add_constant_residual_8x8_c(const int16_t diff, uint8_t *dest, int stride); #define vp9_add_constant_residual_8x8 vp9_add_constant_residual_8x8_c @@ -77,7 +212,7 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *bli void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); #define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -95,22 +230,28 @@ void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, in void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); #define vp9_blend_b vp9_blend_b_c -void vp9_convolve8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_c + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, 
uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_c + +void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8 vp9_convolve8_c -void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_horiz vp9_convolve8_horiz_c -void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_vert vp9_convolve8_vert_c -void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg vp9_convolve8_avg_c -void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_horiz vp9_convolve8_avg_horiz_c -void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); #define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); @@ -158,9 +299,6 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *output, int pitch, int tx void vp9_idct4_1d_c(int16_t *input, int16_t *output); #define vp9_idct4_1d vp9_idct4_1d_c -void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride); -#define vp9_dc_only_idct_add vp9_dc_only_idct_add_c - void vp9_short_iwalsh4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_iwalsh4x4_1_add vp9_short_iwalsh4x4_1_add_c diff --git a/mips/vpx_config.h b/mips/vpx_config.h index 49eab1e..51ea388 100644 --- a/mips/vpx_config.h +++ b/mips/vpx_config.h @@ -87,5 +87,4 @@ #define CONFIG_MULTIPLE_ARF 0 #define CONFIG_NON420 0 #define CONFIG_ALPHA 0 -#define CONFIG_BALANCED_COEFTREE 0 #endif /* VPX_CONFIG_H */ diff --git a/mips/vpx_scale_rtcd.h b/mips/vpx_scale_rtcd.h index 
7af466a..be038f4 100644 --- a/mips/vpx_scale_rtcd.h +++ b/mips/vpx_scale_rtcd.h @@ -42,6 +42,9 @@ void vp8_yv12_copy_y_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_co void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); #define vp9_extend_frame_borders vp9_extend_frame_borders_c +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_scale_rtcd(void); #include "vpx_config.h"
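
Editor's note: for readers following the rewritten vp9_peek_si() hunk in vp9_dx_iface.c above, here is a standalone sketch of the same uncompressed-header peek. The function and type names (peek_vp9_frame_header, vp9_peek_info) are illustrative, and the VP9_SYNC_CODE_* values are what the decoder's SYNC_CODE_0/1/2 macros are commonly defined to (0x49 0x83 0x42); only the bit layout is taken from the diff itself.

#include <stddef.h>
#include <stdint.h>

#define VP9_SYNC_CODE_0 0x49  /* assumed values of the decoder's SYNC_CODE_* macros */
#define VP9_SYNC_CODE_1 0x83
#define VP9_SYNC_CODE_2 0x42

typedef struct { int is_kf; int w; int h; } vp9_peek_info;

/* Returns 0 on success, -1 for truncated or unsupported data. */
static int peek_vp9_frame_header(const uint8_t *data, size_t data_sz,
                                 vp9_peek_info *out) {
  if (data == NULL || data_sz <= 8)
    return -1;                                       /* marker + sync + size bytes */

  const int frame_marker = (data[0] >> 6) & 0x3;     /* must be binary 10 */
  const int version = (data[0] >> 4) & 0x3;          /* only version 0 handled here */
  if (frame_marker != 0x2 || version != 0)
    return -1;

  out->is_kf = !((data[0] >> 2) & 0x1);              /* bit clear => key frame */
  out->w = out->h = 0;

  if (out->is_kf) {
    const uint8_t *c = data + 1;
    if (c[0] != VP9_SYNC_CODE_0 || c[1] != VP9_SYNC_CODE_1 ||
        c[2] != VP9_SYNC_CODE_2)
      return -1;
    c += 3;
    /* width-1 and height-1 are 16-bit values spread across nibbles */
    out->w = (((c[0] & 0xf) << 12) | (c[1] << 4) | ((c[2] >> 4) & 0xf)) + 1;
    out->h = (((c[2] & 0xf) << 12) | (c[3] << 4) | ((c[4] >> 4) & 0xf)) + 1;
  }
  return 0;
}

Note the "+ 1": the header stores width-1 and height-1, which is why the old vp8-style "(c[3] << 8) | c[4]" read that this hunk deletes could never apply to the new format.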
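
The yuvconfig2image() change in vp9_iface_common.h swaps multiple8() for ALIGN_POWER_OF_TWO(value, 3), i.e. rounding up to the next multiple of 8. The macro body below is an assumption about its definition (it is not part of this diff), shown only to make the substitution concrete:

/* Assumed definition: round `value` up to the nearest multiple of 2^n.
 * Requires value >= 0 and n small enough that (1 << n) does not overflow. */
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

/* Example: ALIGN_POWER_OF_TWO(1085, 3) == 1088 */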
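
The mmap helpers added to vpx_codec.c above over-allocate by align-1 bytes and round the raw pointer up to the requested alignment. A minimal self-contained sketch of that scheme follows; aligned_block, aligned_alloc_block and aligned_free_block are illustrative names, and `align` is assumed to be a power of two, as it is for the codec's segment requests.

#include <stdint.h>
#include <stdlib.h>

typedef struct {
  void *priv;  /* pointer actually returned by malloc(), kept for free() */
  void *base;  /* first address inside priv that satisfies the alignment */
} aligned_block;

/* Returns 0 on success, -1 on allocation failure. */
static int aligned_alloc_block(aligned_block *blk, size_t sz, size_t align) {
  const size_t mask = align ? align - 1 : 0;  /* valid because align is 2^n */

  blk->priv = malloc(sz + mask);              /* worst-case padding up front */
  if (blk->priv == NULL)
    return -1;

  /* Round the raw pointer up to the next multiple of `align`. */
  blk->base =
      (void *)(((uintptr_t)blk->priv + (uintptr_t)mask) & ~(uintptr_t)mask);
  return 0;
}

static void aligned_free_block(aligned_block *blk) {
  free(blk->priv);  /* free the original pointer, never the aligned one */
  blk->priv = NULL;
  blk->base = NULL;
}

The real vpx_mmap_alloc() additionally honours VPX_CODEC_MEM_ZERO by using calloc() and records vpx_mmap_dtor() as the destructor; the sketch keeps only the alignment arithmetic.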